path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile          |    2
-rw-r--r--  kernel/acct.c            |   45
-rw-r--r--  kernel/audit.c           |    3
-rw-r--r--  kernel/compat.c          |    9
-rw-r--r--  kernel/cpuset.c          |  226
-rw-r--r--  kernel/exit.c            |   26
-rw-r--r--  kernel/fork.c            |  104
-rw-r--r--  kernel/futex.c           |  137
-rw-r--r--  kernel/intermodule.c     |    3
-rw-r--r--  kernel/irq/handle.c      |    2
-rw-r--r--  kernel/irq/manage.c      |    4
-rw-r--r--  kernel/irq/proc.c        |   14
-rw-r--r--  kernel/kprobes.c         |   94
-rw-r--r--  kernel/module.c          |   44
-rw-r--r--  kernel/params.c          |    4
-rw-r--r--  kernel/posix-timers.c    |   28
-rw-r--r--  kernel/power/Kconfig     |   15
-rw-r--r--  kernel/power/disk.c      |   55
-rw-r--r--  kernel/power/main.c      |    5
-rw-r--r--  kernel/power/pm.c        |    3
-rw-r--r--  kernel/power/process.c   |   29
-rw-r--r--  kernel/power/swsusp.c    |  202
-rw-r--r--  kernel/printk.c          |   13
-rw-r--r--  kernel/ptrace.c          |   41
-rw-r--r--  kernel/rcupdate.c        |   14
-rw-r--r--  kernel/resource.c        |    3
-rw-r--r--  kernel/sched.c           |  617
-rw-r--r--  kernel/signal.c          |   86
-rw-r--r--  kernel/softirq.c         |    2
-rw-r--r--  kernel/softlockup.c      |  151
-rw-r--r--  kernel/spinlock.c        |   15
-rw-r--r--  kernel/sys.c             |    6
-rw-r--r--  kernel/sysctl.c          |    4
-rw-r--r--  kernel/timer.c           |   55
-rw-r--r--  kernel/workqueue.c       |    5
35 files changed, 1580 insertions(+), 486 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..ff4dc02ce170 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
12obj-$(CONFIG_FUTEX) += futex.o 12obj-$(CONFIG_FUTEX) += futex.o
13obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 13obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
14obj-$(CONFIG_SMP) += cpu.o spinlock.o 14obj-$(CONFIG_SMP) += cpu.o spinlock.o
15obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
15obj-$(CONFIG_UID16) += uid16.o 16obj-$(CONFIG_UID16) += uid16.o
16obj-$(CONFIG_MODULES) += module.o 17obj-$(CONFIG_MODULES) += module.o
17obj-$(CONFIG_KALLSYMS) += kallsyms.o 18obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o
27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 28obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
28obj-$(CONFIG_KPROBES) += kprobes.o 29obj-$(CONFIG_KPROBES) += kprobes.o
29obj-$(CONFIG_SYSFS) += ksysfs.o 30obj-$(CONFIG_SYSFS) += ksysfs.o
31obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 32obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 33obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
32obj-$(CONFIG_SECCOMP) += seccomp.o 34obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..b756f527497e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out:
165} 165}
166 166
167/* 167/*
168 * Close the old accouting file (if currently open) and then replace 168 * Close the old accounting file (if currently open) and then replace
169 * it with file (if non-NULL). 169 * it with file (if non-NULL).
170 * 170 *
171 * NOTE: acct_globals.lock MUST be held on entry and exit. 171 * NOTE: acct_globals.lock MUST be held on entry and exit.
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file)
199 } 199 }
200} 200}
201 201
202/* 202/**
203 * sys_acct() is the only system call needed to implement process 203 * sys_acct - enable/disable process accounting
204 * accounting. It takes the name of the file where accounting records 204 * @name: file name for accounting records or NULL to shutdown accounting
205 * should be written. If the filename is NULL, accounting will be 205 *
206 * shutdown. 206 * Returns 0 for success or negative errno values for failure.
207 *
208 * sys_acct() is the only system call needed to implement process
209 * accounting. It takes the name of the file where accounting records
210 * should be written. If the filename is NULL, accounting will be
211 * shutdown.
207 */ 212 */
208asmlinkage long sys_acct(const char __user *name) 213asmlinkage long sys_acct(const char __user *name)
209{ 214{
@@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name)
220 return (PTR_ERR(tmp)); 225 return (PTR_ERR(tmp));
221 } 226 }
222 /* Difference from BSD - they don't do O_APPEND */ 227 /* Difference from BSD - they don't do O_APPEND */
223 file = filp_open(tmp, O_WRONLY|O_APPEND, 0); 228 file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
224 putname(tmp); 229 putname(tmp);
225 if (IS_ERR(file)) { 230 if (IS_ERR(file)) {
226 return (PTR_ERR(file)); 231 return (PTR_ERR(file));
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name)
250 return (0); 255 return (0);
251} 256}
252 257
253/* 258/**
254 * If the accouting is turned on for a file in the filesystem pointed 259 * acct_auto_close - turn off a filesystem's accounting if it is on
255 * to by sb, turn accouting off. 260 * @sb: super block for the filesystem
261 *
262 * If the accounting is turned on for a file in the filesystem pointed
263 * to by sb, turn accounting off.
256 */ 264 */
257void acct_auto_close(struct super_block *sb) 265void acct_auto_close(struct super_block *sb)
258{ 266{
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file)
503 set_fs(fs); 511 set_fs(fs);
504} 512}
505 513
506/* 514/**
507 * acct_process - now just a wrapper around do_acct_process 515 * acct_process - now just a wrapper around do_acct_process
516 * @exitcode: task exit code
517 *
518 * handles process accounting for an exiting task
508 */ 519 */
509void acct_process(long exitcode) 520void acct_process(long exitcode)
510{ 521{
@@ -530,9 +541,9 @@ void acct_process(long exitcode)
530} 541}
531 542
532 543
533/* 544/**
534 * acct_update_integrals 545 * acct_update_integrals - update mm integral fields in task_struct
535 * - update mm integral fields in task_struct 546 * @tsk: task_struct for accounting
536 */ 547 */
537void acct_update_integrals(struct task_struct *tsk) 548void acct_update_integrals(struct task_struct *tsk)
538{ 549{
@@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk)
547 } 558 }
548} 559}
549 560
550/* 561/**
551 * acct_clear_integrals 562 * acct_clear_integrals - clear the mm integral fields in task_struct
552 * - clear the mm integral fields in task_struct 563 * @tsk: task_struct whose accounting fields are cleared
553 */ 564 */
554void acct_clear_integrals(struct task_struct *tsk) 565void acct_clear_integrals(struct task_struct *tsk)
555{ 566{
diff --git a/kernel/audit.c b/kernel/audit.c
index 8376ec10cf24..83096b67510a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -513,7 +513,8 @@ static int __init audit_init(void)
513{ 513{
514 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 514 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
515 audit_default ? "enabled" : "disabled"); 515 audit_default ? "enabled" : "disabled");
516 audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); 516 audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
517 THIS_MODULE);
517 if (!audit_sock) 518 if (!audit_sock)
518 audit_panic("cannot initialize netlink socket"); 519 audit_panic("cannot initialize netlink socket");
519 520
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa86623..102296e21ea8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
48 if (!time_after(expire, now)) 48 if (!time_after(expire, now))
49 return 0; 49 return 0;
50 50
51 current->state = TASK_INTERRUPTIBLE; 51 expire = schedule_timeout_interruptible(expire - now);
52 expire = schedule_timeout(expire - now);
53 if (expire == 0) 52 if (expire == 0)
54 return 0; 53 return 0;
55 54
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
82 return -EINVAL; 81 return -EINVAL;
83 82
84 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 83 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
85 current->state = TASK_INTERRUPTIBLE; 84 expire = schedule_timeout_interruptible(expire);
86 expire = schedule_timeout(expire);
87 if (expire == 0) 85 if (expire == 0)
88 return 0; 86 return 0;
89 87
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
795 recalc_sigpending(); 793 recalc_sigpending();
796 spin_unlock_irq(&current->sighand->siglock); 794 spin_unlock_irq(&current->sighand->siglock);
797 795
798 current->state = TASK_INTERRUPTIBLE; 796 timeout = schedule_timeout_interruptible(timeout);
799 timeout = schedule_timeout(timeout);
800 797
801 spin_lock_irq(&current->sighand->siglock); 798 spin_lock_irq(&current->sighand->siglock);
802 sig = dequeue_signal(current, &s, &info); 799 sig = dequeue_signal(current, &s, &info);
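The three kernel/compat.c hunks above all replace the open-coded "set task state, then schedule_timeout()" pair with the new schedule_timeout_interruptible() helper (added to kernel/timer.c in this same series). A minimal sketch of what the helper amounts to, assuming the obvious implementation rather than quoting the real kernel/timer.c body:

    /* Sketch: roughly what schedule_timeout_interruptible() does.
     * It replaces the removed two-line pattern:
     *     current->state = TASK_INTERRUPTIBLE;
     *     expire = schedule_timeout(expire - now);
     */
    signed long schedule_timeout_interruptible_sketch(signed long timeout)
    {
            __set_current_state(TASK_INTERRUPTIBLE);   /* mark task sleepable */
            return schedule_timeout(timeout);          /* remaining jiffies   */
    }

Folding the state change into the helper keeps the state assignment and the sleep from drifting apart at the call sites.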
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..79866bc6b3a1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL;
180 */ 180 */
181 181
182static DECLARE_MUTEX(cpuset_sem); 182static DECLARE_MUTEX(cpuset_sem);
183static struct task_struct *cpuset_sem_owner;
184static int cpuset_sem_depth;
185
186/*
187 * The global cpuset semaphore cpuset_sem can be needed by the
188 * memory allocator to update a tasks mems_allowed (see the calls
189 * to cpuset_update_current_mems_allowed()) or to walk up the
190 * cpuset hierarchy to find a mem_exclusive cpuset see the calls
191 * to cpuset_excl_nodes_overlap()).
192 *
193 * But if the memory allocation is being done by cpuset.c code, it
194 * usually already holds cpuset_sem. Double tripping on a kernel
195 * semaphore deadlocks the current task, and any other task that
196 * subsequently tries to obtain the lock.
197 *
198 * Run all up's and down's on cpuset_sem through the following
199 * wrappers, which will detect this nested locking, and avoid
200 * deadlocking.
201 */
202
203static inline void cpuset_down(struct semaphore *psem)
204{
205 if (cpuset_sem_owner != current) {
206 down(psem);
207 cpuset_sem_owner = current;
208 }
209 cpuset_sem_depth++;
210}
211
212static inline void cpuset_up(struct semaphore *psem)
213{
214 if (--cpuset_sem_depth == 0) {
215 cpuset_sem_owner = NULL;
216 up(psem);
217 }
218}
183 219
184/* 220/*
185 * A couple of forward declarations required, due to cyclic reference loop: 221 * A couple of forward declarations required, due to cyclic reference loop:
@@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
522 * Refresh current tasks mems_allowed and mems_generation from 558 * Refresh current tasks mems_allowed and mems_generation from
523 * current tasks cpuset. Call with cpuset_sem held. 559 * current tasks cpuset. Call with cpuset_sem held.
524 * 560 *
525 * Be sure to call refresh_mems() on any cpuset operation which 561 * This routine is needed to update the per-task mems_allowed
526 * (1) holds cpuset_sem, and (2) might possibly alloc memory. 562 * data, within the tasks context, when it is trying to allocate
527 * Call after obtaining cpuset_sem lock, before any possible 563 * memory (in various mm/mempolicy.c routines) and notices
528 * allocation. Otherwise one risks trying to allocate memory 564 * that some other task has been modifying its cpuset.
529 * while the task cpuset_mems_generation is not the same as
530 * the mems_generation in its cpuset, which would deadlock on
531 * cpuset_sem in cpuset_update_current_mems_allowed().
532 *
533 * Since we hold cpuset_sem, once refresh_mems() is called, the
534 * test (current->cpuset_mems_generation != cs->mems_generation)
535 * in cpuset_update_current_mems_allowed() will remain false,
536 * until we drop cpuset_sem. Anyone else who would change our
537 * cpusets mems_generation needs to lock cpuset_sem first.
538 */ 565 */
539 566
540static void refresh_mems(void) 567static void refresh_mems(void)
@@ -628,13 +655,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 655 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
629 */ 656 */
630 657
631/*
632 * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
633 * Disable letting 'cpu_exclusive' cpusets define dynamic sched
634 * domains, until the sched domain can handle partial nodes.
635 * Remove this #if hackery when sched domains fixed.
636 */
637#if 0
638static void update_cpu_domains(struct cpuset *cur) 658static void update_cpu_domains(struct cpuset *cur)
639{ 659{
640 struct cpuset *c, *par = cur->parent; 660 struct cpuset *c, *par = cur->parent;
@@ -675,11 +695,6 @@ static void update_cpu_domains(struct cpuset *cur)
675 partition_sched_domains(&pspan, &cspan); 695 partition_sched_domains(&pspan, &cspan);
676 unlock_cpu_hotplug(); 696 unlock_cpu_hotplug();
677} 697}
678#else
679static void update_cpu_domains(struct cpuset *cur)
680{
681}
682#endif
683 698
684static int update_cpumask(struct cpuset *cs, char *buf) 699static int update_cpumask(struct cpuset *cs, char *buf)
685{ 700{
@@ -852,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
852 } 867 }
853 buffer[nbytes] = 0; /* nul-terminate */ 868 buffer[nbytes] = 0; /* nul-terminate */
854 869
855 down(&cpuset_sem); 870 cpuset_down(&cpuset_sem);
856 871
857 if (is_removed(cs)) { 872 if (is_removed(cs)) {
858 retval = -ENODEV; 873 retval = -ENODEV;
@@ -886,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
886 if (retval == 0) 901 if (retval == 0)
887 retval = nbytes; 902 retval = nbytes;
888out2: 903out2:
889 up(&cpuset_sem); 904 cpuset_up(&cpuset_sem);
890 cpuset_release_agent(pathbuf); 905 cpuset_release_agent(pathbuf);
891out1: 906out1:
892 kfree(buffer); 907 kfree(buffer);
@@ -926,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
926{ 941{
927 cpumask_t mask; 942 cpumask_t mask;
928 943
929 down(&cpuset_sem); 944 cpuset_down(&cpuset_sem);
930 mask = cs->cpus_allowed; 945 mask = cs->cpus_allowed;
931 up(&cpuset_sem); 946 cpuset_up(&cpuset_sem);
932 947
933 return cpulist_scnprintf(page, PAGE_SIZE, mask); 948 return cpulist_scnprintf(page, PAGE_SIZE, mask);
934} 949}
@@ -937,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
937{ 952{
938 nodemask_t mask; 953 nodemask_t mask;
939 954
940 down(&cpuset_sem); 955 cpuset_down(&cpuset_sem);
941 mask = cs->mems_allowed; 956 mask = cs->mems_allowed;
942 up(&cpuset_sem); 957 cpuset_up(&cpuset_sem);
943 958
944 return nodelist_scnprintf(page, PAGE_SIZE, mask); 959 return nodelist_scnprintf(page, PAGE_SIZE, mask);
945} 960}
@@ -984,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
984 *s++ = '\n'; 999 *s++ = '\n';
985 *s = '\0'; 1000 *s = '\0';
986 1001
1002 /* Do nothing if *ppos is at the eof or beyond the eof. */
1003 if (s - page <= *ppos)
1004 return 0;
1005
987 start = page + *ppos; 1006 start = page + *ppos;
988 n = s - start; 1007 n = s - start;
989 retval = n - copy_to_user(buf, start, min(n, nbytes)); 1008 retval = n - copy_to_user(buf, start, min(n, nbytes));
@@ -1342,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1342 if (!cs) 1361 if (!cs)
1343 return -ENOMEM; 1362 return -ENOMEM;
1344 1363
1345 down(&cpuset_sem); 1364 cpuset_down(&cpuset_sem);
1346 refresh_mems();
1347 cs->flags = 0; 1365 cs->flags = 0;
1348 if (notify_on_release(parent)) 1366 if (notify_on_release(parent))
1349 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1367 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1368,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1368 * will down() this new directory's i_sem and if we race with 1386 * will down() this new directory's i_sem and if we race with
1369 * another mkdir, we might deadlock. 1387 * another mkdir, we might deadlock.
1370 */ 1388 */
1371 up(&cpuset_sem); 1389 cpuset_up(&cpuset_sem);
1372 1390
1373 err = cpuset_populate_dir(cs->dentry); 1391 err = cpuset_populate_dir(cs->dentry);
1374 /* If err < 0, we have a half-filled directory - oh well ;) */ 1392 /* If err < 0, we have a half-filled directory - oh well ;) */
1375 return 0; 1393 return 0;
1376err: 1394err:
1377 list_del(&cs->sibling); 1395 list_del(&cs->sibling);
1378 up(&cpuset_sem); 1396 cpuset_up(&cpuset_sem);
1379 kfree(cs); 1397 kfree(cs);
1380 return err; 1398 return err;
1381} 1399}
@@ -1397,14 +1415,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1397 1415
1398 /* the vfs holds both inode->i_sem already */ 1416 /* the vfs holds both inode->i_sem already */
1399 1417
1400 down(&cpuset_sem); 1418 cpuset_down(&cpuset_sem);
1401 refresh_mems();
1402 if (atomic_read(&cs->count) > 0) { 1419 if (atomic_read(&cs->count) > 0) {
1403 up(&cpuset_sem); 1420 cpuset_up(&cpuset_sem);
1404 return -EBUSY; 1421 return -EBUSY;
1405 } 1422 }
1406 if (!list_empty(&cs->children)) { 1423 if (!list_empty(&cs->children)) {
1407 up(&cpuset_sem); 1424 cpuset_up(&cpuset_sem);
1408 return -EBUSY; 1425 return -EBUSY;
1409 } 1426 }
1410 parent = cs->parent; 1427 parent = cs->parent;
@@ -1420,7 +1437,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1420 spin_unlock(&d->d_lock); 1437 spin_unlock(&d->d_lock);
1421 cpuset_d_remove_dir(d); 1438 cpuset_d_remove_dir(d);
1422 dput(d); 1439 dput(d);
1423 up(&cpuset_sem); 1440 cpuset_up(&cpuset_sem);
1424 cpuset_release_agent(pathbuf); 1441 cpuset_release_agent(pathbuf);
1425 return 0; 1442 return 0;
1426} 1443}
@@ -1523,10 +1540,10 @@ void cpuset_exit(struct task_struct *tsk)
1523 if (notify_on_release(cs)) { 1540 if (notify_on_release(cs)) {
1524 char *pathbuf = NULL; 1541 char *pathbuf = NULL;
1525 1542
1526 down(&cpuset_sem); 1543 cpuset_down(&cpuset_sem);
1527 if (atomic_dec_and_test(&cs->count)) 1544 if (atomic_dec_and_test(&cs->count))
1528 check_for_release(cs, &pathbuf); 1545 check_for_release(cs, &pathbuf);
1529 up(&cpuset_sem); 1546 cpuset_up(&cpuset_sem);
1530 cpuset_release_agent(pathbuf); 1547 cpuset_release_agent(pathbuf);
1531 } else { 1548 } else {
1532 atomic_dec(&cs->count); 1549 atomic_dec(&cs->count);
@@ -1547,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1547{ 1564{
1548 cpumask_t mask; 1565 cpumask_t mask;
1549 1566
1550 down(&cpuset_sem); 1567 cpuset_down(&cpuset_sem);
1551 task_lock((struct task_struct *)tsk); 1568 task_lock((struct task_struct *)tsk);
1552 guarantee_online_cpus(tsk->cpuset, &mask); 1569 guarantee_online_cpus(tsk->cpuset, &mask);
1553 task_unlock((struct task_struct *)tsk); 1570 task_unlock((struct task_struct *)tsk);
1554 up(&cpuset_sem); 1571 cpuset_up(&cpuset_sem);
1555 1572
1556 return mask; 1573 return mask;
1557} 1574}
@@ -1576,9 +1593,9 @@ void cpuset_update_current_mems_allowed(void)
1576 if (!cs) 1593 if (!cs)
1577 return; /* task is exiting */ 1594 return; /* task is exiting */
1578 if (current->cpuset_mems_generation != cs->mems_generation) { 1595 if (current->cpuset_mems_generation != cs->mems_generation) {
1579 down(&cpuset_sem); 1596 cpuset_down(&cpuset_sem);
1580 refresh_mems(); 1597 refresh_mems();
1581 up(&cpuset_sem); 1598 cpuset_up(&cpuset_sem);
1582 } 1599 }
1583} 1600}
1584 1601
@@ -1611,17 +1628,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1611 return 0; 1628 return 0;
1612} 1629}
1613 1630
1631/*
1632 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1633 * ancestor to the specified cpuset. Call while holding cpuset_sem.
1634 * If no ancestor is mem_exclusive (an unusual configuration), then
1635 * returns the root cpuset.
1636 */
1637static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1638{
1639 while (!is_mem_exclusive(cs) && cs->parent)
1640 cs = cs->parent;
1641 return cs;
1642}
1643
1614/** 1644/**
1615 * cpuset_zone_allowed - is zone z allowed in current->mems_allowed 1645 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
1616 * @z: zone in question 1646 * @z: is this zone on an allowed node?
1647 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
1617 * 1648 *
1618 * Is zone z allowed in current->mems_allowed, or is 1649 * If we're in interrupt, yes, we can always allocate. If zone
1619 * the CPU in interrupt context? (zone is always allowed in this case) 1650 * z's node is in our tasks mems_allowed, yes. If it's not a
1620 */ 1651 * __GFP_HARDWALL request and this zone's nodes is in the nearest
1621int cpuset_zone_allowed(struct zone *z) 1652 * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
1653 * Otherwise, no.
1654 *
1655 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
1656 * and do not allow allocations outside the current tasks cpuset.
1657 * GFP_KERNEL allocations are not so marked, so can escape to the
1658 * nearest mem_exclusive ancestor cpuset.
1659 *
1660 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
1661 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1662 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
1663 * mems_allowed came up empty on the first pass over the zonelist.
1664 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1665 * short of memory, might require taking the cpuset_sem semaphore.
1666 *
1667 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1668 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
1669 * hardwall cpusets - no allocation on a node outside the cpuset is
1670 * allowed (unless in interrupt, of course).
1671 *
1672 * The second loop doesn't even call here for GFP_ATOMIC requests
1673 * (if the __alloc_pages() local variable 'wait' is set). That check
1674 * and the checks below have the combined affect in the second loop of
1675 * the __alloc_pages() routine that:
1676 * in_interrupt - any node ok (current task context irrelevant)
1677 * GFP_ATOMIC - any node ok
1678 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
1679 * GFP_USER - only nodes in current tasks mems allowed ok.
1680 **/
1681
1682int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
1683{
1684 int node; /* node that zone z is on */
1685 const struct cpuset *cs; /* current cpuset ancestors */
1686 int allowed = 1; /* is allocation in zone z allowed? */
1687
1688 if (in_interrupt())
1689 return 1;
1690 node = z->zone_pgdat->node_id;
1691 if (node_isset(node, current->mems_allowed))
1692 return 1;
1693 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
1694 return 0;
1695
1696 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1697 cpuset_down(&cpuset_sem);
1698 cs = current->cpuset;
1699 if (!cs)
1700 goto done; /* current task exiting */
1701 cs = nearest_exclusive_ancestor(cs);
1702 allowed = node_isset(node, cs->mems_allowed);
1703done:
1704 cpuset_up(&cpuset_sem);
1705 return allowed;
1706}
1707
1708/**
1709 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1710 * @p: pointer to task_struct of some other task.
1711 *
1712 * Description: Return true if the nearest mem_exclusive ancestor
1713 * cpusets of tasks @p and current overlap. Used by oom killer to
1714 * determine if task @p's memory usage might impact the memory
1715 * available to the current task.
1716 *
1717 * Acquires cpuset_sem - not suitable for calling from a fast path.
1718 **/
1719
1720int cpuset_excl_nodes_overlap(const struct task_struct *p)
1622{ 1721{
1623 return in_interrupt() || 1722 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1624 node_isset(z->zone_pgdat->node_id, current->mems_allowed); 1723 int overlap = 0; /* do cpusets overlap? */
1724
1725 cpuset_down(&cpuset_sem);
1726 cs1 = current->cpuset;
1727 if (!cs1)
1728 goto done; /* current task exiting */
1729 cs2 = p->cpuset;
1730 if (!cs2)
1731 goto done; /* task p is exiting */
1732 cs1 = nearest_exclusive_ancestor(cs1);
1733 cs2 = nearest_exclusive_ancestor(cs2);
1734 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1735done:
1736 cpuset_up(&cpuset_sem);
1737
1738 return overlap;
1625} 1739}
1626 1740
1627/* 1741/*
@@ -1642,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1642 return -ENOMEM; 1756 return -ENOMEM;
1643 1757
1644 tsk = m->private; 1758 tsk = m->private;
1645 down(&cpuset_sem); 1759 cpuset_down(&cpuset_sem);
1646 task_lock(tsk); 1760 task_lock(tsk);
1647 cs = tsk->cpuset; 1761 cs = tsk->cpuset;
1648 task_unlock(tsk); 1762 task_unlock(tsk);
@@ -1657,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1657 seq_puts(m, buf); 1771 seq_puts(m, buf);
1658 seq_putc(m, '\n'); 1772 seq_putc(m, '\n');
1659out: 1773out:
1660 up(&cpuset_sem); 1774 cpuset_up(&cpuset_sem);
1661 kfree(buf); 1775 kfree(buf);
1662 return retval; 1776 return retval;
1663} 1777}
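The cpuset_down()/cpuset_up() wrappers introduced above exist because cpuset code that already holds cpuset_sem can wander back into the page allocator, which in turn calls cpuset_zone_allowed() and would otherwise do a second down() on the same semaphore and deadlock. A hedged sketch of the nesting the wrappers tolerate; the outer function here is hypothetical, only the wrappers come from the patch:

    /* Illustrative caller: shows the re-entry the depth counter absorbs. */
    static void cpuset_config_change_sketch(void)
    {
            cpuset_down(&cpuset_sem);       /* depth 1, owner = current       */

            /* A GFP_KERNEL allocation made while holding the lock may fall
             * back into cpuset_zone_allowed(), which also takes it:          */
            cpuset_down(&cpuset_sem);       /* same owner: depth 2, no down() */
            cpuset_up(&cpuset_sem);         /* depth 1, semaphore still held  */

            cpuset_up(&cpuset_sem);         /* depth 0: real up(), owner NULL */
    }

Note that this only protects against self-deadlock by the owning task; any other task still blocks in down() until the depth count returns to zero.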
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b0fb9f09f21..6d2089a1bce7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize);
368static inline void close_files(struct files_struct * files) 368static inline void close_files(struct files_struct * files)
369{ 369{
370 int i, j; 370 int i, j;
371 struct fdtable *fdt;
371 372
372 j = 0; 373 j = 0;
374 fdt = files_fdtable(files);
373 for (;;) { 375 for (;;) {
374 unsigned long set; 376 unsigned long set;
375 i = j * __NFDBITS; 377 i = j * __NFDBITS;
376 if (i >= files->max_fdset || i >= files->max_fds) 378 if (i >= fdt->max_fdset || i >= fdt->max_fds)
377 break; 379 break;
378 set = files->open_fds->fds_bits[j++]; 380 set = fdt->open_fds->fds_bits[j++];
379 while (set) { 381 while (set) {
380 if (set & 1) { 382 if (set & 1) {
381 struct file * file = xchg(&files->fd[i], NULL); 383 struct file * file = xchg(&fdt->fd[i], NULL);
382 if (file) 384 if (file)
383 filp_close(file, files); 385 filp_close(file, files);
384 } 386 }
@@ -403,18 +405,22 @@ struct files_struct *get_files_struct(struct task_struct *task)
403 405
404void fastcall put_files_struct(struct files_struct *files) 406void fastcall put_files_struct(struct files_struct *files)
405{ 407{
408 struct fdtable *fdt;
409
406 if (atomic_dec_and_test(&files->count)) { 410 if (atomic_dec_and_test(&files->count)) {
407 close_files(files); 411 close_files(files);
408 /* 412 /*
409 * Free the fd and fdset arrays if we expanded them. 413 * Free the fd and fdset arrays if we expanded them.
414 * If the fdtable was embedded, pass files for freeing
415 * at the end of the RCU grace period. Otherwise,
416 * you can free files immediately.
410 */ 417 */
411 if (files->fd != &files->fd_array[0]) 418 fdt = files_fdtable(files);
412 free_fd_array(files->fd, files->max_fds); 419 if (fdt == &files->fdtab)
413 if (files->max_fdset > __FD_SETSIZE) { 420 fdt->free_files = files;
414 free_fdset(files->open_fds, files->max_fdset); 421 else
415 free_fdset(files->close_on_exec, files->max_fdset); 422 kmem_cache_free(files_cachep, files);
416 } 423 free_fdtable(fdt);
417 kmem_cache_free(files_cachep, files);
418 } 424 }
419} 425}
420 426
diff --git a/kernel/fork.c b/kernel/fork.c
index b65187f0c74e..8149f3602881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/jiffies.h> 36#include <linux/jiffies.h>
37#include <linux/futex.h> 37#include <linux/futex.h>
38#include <linux/rcupdate.h>
38#include <linux/ptrace.h> 39#include <linux/ptrace.h>
39#include <linux/mount.h> 40#include <linux/mount.h>
40#include <linux/audit.h> 41#include <linux/audit.h>
@@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
176 177
177 /* One for us, one for whoever does the "release_task()" (usually parent) */ 178 /* One for us, one for whoever does the "release_task()" (usually parent) */
178 atomic_set(&tsk->usage,2); 179 atomic_set(&tsk->usage,2);
180 atomic_set(&tsk->fs_excl, 0);
179 return tsk; 181 return tsk;
180} 182}
181 183
@@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
564 return 0; 566 return 0;
565} 567}
566 568
567static int count_open_files(struct files_struct *files, int size) 569static int count_open_files(struct fdtable *fdt)
568{ 570{
571 int size = fdt->max_fdset;
569 int i; 572 int i;
570 573
571 /* Find the last open fd */ 574 /* Find the last open fd */
572 for (i = size/(8*sizeof(long)); i > 0; ) { 575 for (i = size/(8*sizeof(long)); i > 0; ) {
573 if (files->open_fds->fds_bits[--i]) 576 if (fdt->open_fds->fds_bits[--i])
574 break; 577 break;
575 } 578 }
576 i = (i+1) * 8 * sizeof(long); 579 i = (i+1) * 8 * sizeof(long);
577 return i; 580 return i;
578} 581}
579 582
583static struct files_struct *alloc_files(void)
584{
585 struct files_struct *newf;
586 struct fdtable *fdt;
587
588 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
589 if (!newf)
590 goto out;
591
592 atomic_set(&newf->count, 1);
593
594 spin_lock_init(&newf->file_lock);
595 fdt = &newf->fdtab;
596 fdt->next_fd = 0;
597 fdt->max_fds = NR_OPEN_DEFAULT;
598 fdt->max_fdset = __FD_SETSIZE;
599 fdt->close_on_exec = &newf->close_on_exec_init;
600 fdt->open_fds = &newf->open_fds_init;
601 fdt->fd = &newf->fd_array[0];
602 INIT_RCU_HEAD(&fdt->rcu);
603 fdt->free_files = NULL;
604 fdt->next = NULL;
605 rcu_assign_pointer(newf->fdt, fdt);
606out:
607 return newf;
608}
609
580static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 610static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
581{ 611{
582 struct files_struct *oldf, *newf; 612 struct files_struct *oldf, *newf;
583 struct file **old_fds, **new_fds; 613 struct file **old_fds, **new_fds;
584 int open_files, size, i, error = 0, expand; 614 int open_files, size, i, error = 0, expand;
615 struct fdtable *old_fdt, *new_fdt;
585 616
586 /* 617 /*
587 * A background process may not have any files ... 618 * A background process may not have any files ...
@@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
602 */ 633 */
603 tsk->files = NULL; 634 tsk->files = NULL;
604 error = -ENOMEM; 635 error = -ENOMEM;
605 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 636 newf = alloc_files();
606 if (!newf) 637 if (!newf)
607 goto out; 638 goto out;
608 639
609 atomic_set(&newf->count, 1);
610
611 spin_lock_init(&newf->file_lock);
612 newf->next_fd = 0;
613 newf->max_fds = NR_OPEN_DEFAULT;
614 newf->max_fdset = __FD_SETSIZE;
615 newf->close_on_exec = &newf->close_on_exec_init;
616 newf->open_fds = &newf->open_fds_init;
617 newf->fd = &newf->fd_array[0];
618
619 spin_lock(&oldf->file_lock); 640 spin_lock(&oldf->file_lock);
620 641 old_fdt = files_fdtable(oldf);
621 open_files = count_open_files(oldf, oldf->max_fdset); 642 new_fdt = files_fdtable(newf);
643 size = old_fdt->max_fdset;
644 open_files = count_open_files(old_fdt);
622 expand = 0; 645 expand = 0;
623 646
624 /* 647 /*
625 * Check whether we need to allocate a larger fd array or fd set. 648 * Check whether we need to allocate a larger fd array or fd set.
626 * Note: we're not a clone task, so the open count won't change. 649 * Note: we're not a clone task, so the open count won't change.
627 */ 650 */
628 if (open_files > newf->max_fdset) { 651 if (open_files > new_fdt->max_fdset) {
629 newf->max_fdset = 0; 652 new_fdt->max_fdset = 0;
630 expand = 1; 653 expand = 1;
631 } 654 }
632 if (open_files > newf->max_fds) { 655 if (open_files > new_fdt->max_fds) {
633 newf->max_fds = 0; 656 new_fdt->max_fds = 0;
634 expand = 1; 657 expand = 1;
635 } 658 }
636 659
@@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
642 spin_unlock(&newf->file_lock); 665 spin_unlock(&newf->file_lock);
643 if (error < 0) 666 if (error < 0)
644 goto out_release; 667 goto out_release;
668 new_fdt = files_fdtable(newf);
669 /*
670 * Reacquire the oldf lock and a pointer to its fd table
671 * who knows it may have a new bigger fd table. We need
672 * the latest pointer.
673 */
645 spin_lock(&oldf->file_lock); 674 spin_lock(&oldf->file_lock);
675 old_fdt = files_fdtable(oldf);
646 } 676 }
647 677
648 old_fds = oldf->fd; 678 old_fds = old_fdt->fd;
649 new_fds = newf->fd; 679 new_fds = new_fdt->fd;
650 680
651 memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); 681 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
652 memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); 682 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
653 683
654 for (i = open_files; i != 0; i--) { 684 for (i = open_files; i != 0; i--) {
655 struct file *f = *old_fds++; 685 struct file *f = *old_fds++;
@@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
662 * is partway through open(). So make sure that this 692 * is partway through open(). So make sure that this
663 * fd is available to the new process. 693 * fd is available to the new process.
664 */ 694 */
665 FD_CLR(open_files - i, newf->open_fds); 695 FD_CLR(open_files - i, new_fdt->open_fds);
666 } 696 }
667 *new_fds++ = f; 697 rcu_assign_pointer(*new_fds++, f);
668 } 698 }
669 spin_unlock(&oldf->file_lock); 699 spin_unlock(&oldf->file_lock);
670 700
671 /* compute the remainder to be cleared */ 701 /* compute the remainder to be cleared */
672 size = (newf->max_fds - open_files) * sizeof(struct file *); 702 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
673 703
674 /* This is long word aligned thus could use a optimized version */ 704 /* This is long word aligned thus could use a optimized version */
675 memset(new_fds, 0, size); 705 memset(new_fds, 0, size);
676 706
677 if (newf->max_fdset > open_files) { 707 if (new_fdt->max_fdset > open_files) {
678 int left = (newf->max_fdset-open_files)/8; 708 int left = (new_fdt->max_fdset-open_files)/8;
679 int start = open_files / (8 * sizeof(unsigned long)); 709 int start = open_files / (8 * sizeof(unsigned long));
680 710
681 memset(&newf->open_fds->fds_bits[start], 0, left); 711 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
682 memset(&newf->close_on_exec->fds_bits[start], 0, left); 712 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
683 } 713 }
684 714
685 tsk->files = newf; 715 tsk->files = newf;
@@ -688,9 +718,9 @@ out:
688 return error; 718 return error;
689 719
690out_release: 720out_release:
691 free_fdset (newf->close_on_exec, newf->max_fdset); 721 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
692 free_fdset (newf->open_fds, newf->max_fdset); 722 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
693 free_fd_array(newf->fd, newf->max_fds); 723 free_fd_array(new_fdt->fd, new_fdt->max_fds);
694 kmem_cache_free(files_cachep, newf); 724 kmem_cache_free(files_cachep, newf);
695 goto out; 725 goto out;
696} 726}
@@ -994,6 +1024,9 @@ static task_t *copy_process(unsigned long clone_flags,
994 * of CLONE_PTRACE. 1024 * of CLONE_PTRACE.
995 */ 1025 */
996 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1026 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1027#ifdef TIF_SYSCALL_EMU
1028 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1029#endif
997 1030
998 /* Our parent execution domain becomes current domain 1031 /* Our parent execution domain becomes current domain
999 These must match for thread signalling to apply */ 1032 These must match for thread signalling to apply */
@@ -1112,6 +1145,9 @@ static task_t *copy_process(unsigned long clone_flags,
1112 __get_cpu_var(process_counts)++; 1145 __get_cpu_var(process_counts)++;
1113 } 1146 }
1114 1147
1148 if (!current->signal->tty && p->signal->tty)
1149 p->signal->tty = NULL;
1150
1115 nr_threads++; 1151 nr_threads++;
1116 total_forks++; 1152 total_forks++;
1117 write_unlock_irq(&tasklist_lock); 1153 write_unlock_irq(&tasklist_lock);
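Both the kernel/exit.c and kernel/fork.c changes above are part of moving the fd arrays behind a separate struct fdtable: writers build a complete table and publish it with rcu_assign_pointer(), readers fetch it through files_fdtable(), and the old table (or the embedded files_struct itself) is freed only after an RCU grace period. A rough, illustrative sketch of the read side this enables; the function name is made up and the real lookup helpers live in the fs headers:

    /* Sketch: lock-free check of an fd slot against the published fdtable. */
    static int fd_is_open_sketch(struct files_struct *files, unsigned int fd)
    {
            struct fdtable *fdt;
            int open = 0;

            rcu_read_lock();
            fdt = files_fdtable(files);      /* dereferences files->fdt      */
            if (fd < fdt->max_fds && rcu_dereference(fdt->fd[fd]))
                    open = 1;
            rcu_read_unlock();               /* fdt is not used past here    */
            return open;
    }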
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/syscalls.h> 41#include <linux/syscalls.h>
42#include <linux/signal.h> 42#include <linux/signal.h>
43#include <asm/futex.h>
43 44
44#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 45#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
45 46
@@ -327,6 +328,118 @@ out:
327} 328}
328 329
329/* 330/*
331 * Wake up all waiters hashed on the physical page that is mapped
332 * to this virtual address:
333 */
334static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
335{
336 union futex_key key1, key2;
337 struct futex_hash_bucket *bh1, *bh2;
338 struct list_head *head;
339 struct futex_q *this, *next;
340 int ret, op_ret, attempt = 0;
341
342retryfull:
343 down_read(&current->mm->mmap_sem);
344
345 ret = get_futex_key(uaddr1, &key1);
346 if (unlikely(ret != 0))
347 goto out;
348 ret = get_futex_key(uaddr2, &key2);
349 if (unlikely(ret != 0))
350 goto out;
351
352 bh1 = hash_futex(&key1);
353 bh2 = hash_futex(&key2);
354
355retry:
356 if (bh1 < bh2)
357 spin_lock(&bh1->lock);
358 spin_lock(&bh2->lock);
359 if (bh1 > bh2)
360 spin_lock(&bh1->lock);
361
362 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
363 if (unlikely(op_ret < 0)) {
364 int dummy;
365
366 spin_unlock(&bh1->lock);
367 if (bh1 != bh2)
368 spin_unlock(&bh2->lock);
369
370 /* futex_atomic_op_inuser needs to both read and write
371 * *(int __user *)uaddr2, but we can't modify it
372 * non-atomically. Therefore, if get_user below is not
373 * enough, we need to handle the fault ourselves, while
374 * still holding the mmap_sem. */
375 if (attempt++) {
376 struct vm_area_struct * vma;
377 struct mm_struct *mm = current->mm;
378
379 ret = -EFAULT;
380 if (attempt >= 2 ||
381 !(vma = find_vma(mm, uaddr2)) ||
382 vma->vm_start > uaddr2 ||
383 !(vma->vm_flags & VM_WRITE))
384 goto out;
385
386 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
387 case VM_FAULT_MINOR:
388 current->min_flt++;
389 break;
390 case VM_FAULT_MAJOR:
391 current->maj_flt++;
392 break;
393 default:
394 goto out;
395 }
396 goto retry;
397 }
398
399 /* If we would have faulted, release mmap_sem,
400 * fault it in and start all over again. */
401 up_read(&current->mm->mmap_sem);
402
403 ret = get_user(dummy, (int __user *)uaddr2);
404 if (ret)
405 return ret;
406
407 goto retryfull;
408 }
409
410 head = &bh1->chain;
411
412 list_for_each_entry_safe(this, next, head, list) {
413 if (match_futex (&this->key, &key1)) {
414 wake_futex(this);
415 if (++ret >= nr_wake)
416 break;
417 }
418 }
419
420 if (op_ret > 0) {
421 head = &bh2->chain;
422
423 op_ret = 0;
424 list_for_each_entry_safe(this, next, head, list) {
425 if (match_futex (&this->key, &key2)) {
426 wake_futex(this);
427 if (++op_ret >= nr_wake2)
428 break;
429 }
430 }
431 ret += op_ret;
432 }
433
434 spin_unlock(&bh1->lock);
435 if (bh1 != bh2)
436 spin_unlock(&bh2->lock);
437out:
438 up_read(&current->mm->mmap_sem);
439 return ret;
440}
441
442/*
330 * Requeue all waiters hashed on one physical page to another 443 * Requeue all waiters hashed on one physical page to another
331 * physical page. 444 * physical page.
332 */ 445 */
@@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal)
673 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 786 filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
674 787
675 if (signal) { 788 if (signal) {
676 int err;
677 err = f_setown(filp, current->pid, 1); 789 err = f_setown(filp, current->pid, 1);
678 if (err < 0) { 790 if (err < 0) {
679 put_unused_fd(ret); 791 goto error;
680 put_filp(filp);
681 ret = err;
682 goto out;
683 } 792 }
684 filp->f_owner.signum = signal; 793 filp->f_owner.signum = signal;
685 } 794 }
686 795
687 q = kmalloc(sizeof(*q), GFP_KERNEL); 796 q = kmalloc(sizeof(*q), GFP_KERNEL);
688 if (!q) { 797 if (!q) {
689 put_unused_fd(ret); 798 err = -ENOMEM;
690 put_filp(filp); 799 goto error;
691 ret = -ENOMEM;
692 goto out;
693 } 800 }
694 801
695 down_read(&current->mm->mmap_sem); 802 down_read(&current->mm->mmap_sem);
@@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal)
697 804
698 if (unlikely(err != 0)) { 805 if (unlikely(err != 0)) {
699 up_read(&current->mm->mmap_sem); 806 up_read(&current->mm->mmap_sem);
700 put_unused_fd(ret);
701 put_filp(filp);
702 kfree(q); 807 kfree(q);
703 return err; 808 goto error;
704 } 809 }
705 810
706 /* 811 /*
@@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal)
716 fd_install(ret, filp); 821 fd_install(ret, filp);
717out: 822out:
718 return ret; 823 return ret;
824error:
825 put_unused_fd(ret);
826 put_filp(filp);
827 ret = err;
828 goto out;
719} 829}
720 830
721long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 831long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
740 case FUTEX_CMP_REQUEUE: 850 case FUTEX_CMP_REQUEUE:
741 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 851 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
742 break; 852 break;
853 case FUTEX_WAKE_OP:
854 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
855 break;
743 default: 856 default:
744 ret = -ENOSYS; 857 ret = -ENOSYS;
745 } 858 }
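futex_wake_op() above has to hold two futex hash-bucket locks at once, so it takes them in pointer order, and only once when both keys hash to the same bucket, to avoid an ABBA deadlock against another task operating on the same pair in the opposite order. A generic sketch of that idiom; the bucket type and function names are illustrative, not the futex.c ones:

    struct bucket_sketch { spinlock_t lock; struct list_head chain; };

    static void double_lock_sketch(struct bucket_sketch *a, struct bucket_sketch *b)
    {
            if (a < b)
                    spin_lock(&a->lock);
            spin_lock(&b->lock);
            if (a > b)
                    spin_lock(&a->lock);    /* a == b: locked exactly once */
    }

    static void double_unlock_sketch(struct bucket_sketch *a, struct bucket_sketch *b)
    {
            spin_unlock(&a->lock);
            if (a != b)
                    spin_unlock(&b->lock);
    }

The error path in the hunk also shows the usual mmap_sem dance: if the atomic user-space operation faults, both bucket locks are dropped, the page is faulted in (or the fault is handled directly while still holding mmap_sem on the second attempt), and the whole operation is retried from scratch.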
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
39 struct list_head *tmp; 39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new; 40 struct inter_module_entry *ime, *ime_new;
41 41
42 if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { 42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */ 43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR 44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", 45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
47 kmalloc_failed = 1; 47 kmalloc_failed = 1;
48 return; 48 return;
49 } 49 }
50 memset(ime_new, 0, sizeof(*ime_new));
51 ime_new->im_name = im_name; 50 ime_new->im_name = im_name;
52 ime_new->owner = owner; 51 ime_new->owner = owner;
53 ime_new->userdata = userdata; 52 ime_new->userdata = userdata;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
111 unsigned int status; 111 unsigned int status;
112 112
113 kstat_this_cpu.irqs[irq]++; 113 kstat_this_cpu.irqs[irq]++;
114 if (desc->status & IRQ_PER_CPU) { 114 if (CHECK_IRQ_PER_CPU(desc->status)) {
115 irqreturn_t action_ret; 115 irqreturn_t action_ret;
116 116
117 /* 117 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; 19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20 20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
21/** 25/**
22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
23 * 27 *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
19 */ 19 */
20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; 20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
21 21
22void __attribute__((weak)) 22#ifdef CONFIG_GENERIC_PENDING_IRQ
23proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 23void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{
25 /*
26 * Save these away for later use. Re-progam when the
27 * interrupt is pending
28 */
29 set_pending_irq(irq, mask_val);
30}
31#else
32void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{ 33{
25 irq_affinity[irq] = mask_val; 34 irq_affinity[irq] = mask_val;
26 irq_desc[irq].handler->set_affinity(irq, mask_val); 35 irq_desc[irq].handler->set_affinity(irq, mask_val);
27} 36}
37#endif
28 38
29static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
30 int count, int *eof, void *data) 40 int count, int *eof, void *data)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <asm-generic/sections.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
41#include <asm/errno.h> 42#include <asm/errno.h>
42#include <asm/kdebug.h> 43#include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
72 * get_insn_slot() - Find a slot on an executable page for an instruction. 73 * get_insn_slot() - Find a slot on an executable page for an instruction.
73 * We allocate an executable page if there's no room on existing ones. 74 * We allocate an executable page if there's no room on existing ones.
74 */ 75 */
75kprobe_opcode_t *get_insn_slot(void) 76kprobe_opcode_t __kprobes *get_insn_slot(void)
76{ 77{
77 struct kprobe_insn_page *kip; 78 struct kprobe_insn_page *kip;
78 struct hlist_node *pos; 79 struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
117 return kip->insns; 118 return kip->insns;
118} 119}
119 120
120void free_insn_slot(kprobe_opcode_t *slot) 121void __kprobes free_insn_slot(kprobe_opcode_t *slot)
121{ 122{
122 struct kprobe_insn_page *kip; 123 struct kprobe_insn_page *kip;
123 struct hlist_node *pos; 124 struct hlist_node *pos;
@@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot)
152} 153}
153 154
154/* Locks kprobe: irqs must be disabled */ 155/* Locks kprobe: irqs must be disabled */
155void lock_kprobes(void) 156void __kprobes lock_kprobes(void)
156{ 157{
158 unsigned long flags = 0;
159
160 /* Avoiding local interrupts to happen right after we take the kprobe_lock
161 * and before we get a chance to update kprobe_cpu, this to prevent
162 * deadlock when we have a kprobe on ISR routine and a kprobe on task
163 * routine
164 */
165 local_irq_save(flags);
166
157 spin_lock(&kprobe_lock); 167 spin_lock(&kprobe_lock);
158 kprobe_cpu = smp_processor_id(); 168 kprobe_cpu = smp_processor_id();
169
170 local_irq_restore(flags);
159} 171}
160 172
161void unlock_kprobes(void) 173void __kprobes unlock_kprobes(void)
162{ 174{
175 unsigned long flags = 0;
176
177 /* Avoiding local interrupts to happen right after we update
178 * kprobe_cpu and before we get a a chance to release kprobe_lock,
179 * this to prevent deadlock when we have a kprobe on ISR routine and
180 * a kprobe on task routine
181 */
182 local_irq_save(flags);
183
163 kprobe_cpu = NR_CPUS; 184 kprobe_cpu = NR_CPUS;
164 spin_unlock(&kprobe_lock); 185 spin_unlock(&kprobe_lock);
186
187 local_irq_restore(flags);
165} 188}
166 189
167/* You have to be holding the kprobe_lock */ 190/* You have to be holding the kprobe_lock */
168struct kprobe *get_kprobe(void *addr) 191struct kprobe __kprobes *get_kprobe(void *addr)
169{ 192{
170 struct hlist_head *head; 193 struct hlist_head *head;
171 struct hlist_node *node; 194 struct hlist_node *node;
@@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr)
183 * Aggregate handlers for multiple kprobes support - these handlers 206 * Aggregate handlers for multiple kprobes support - these handlers
184 * take care of invoking the individual kprobe handlers on p->list 207 * take care of invoking the individual kprobe handlers on p->list
185 */ 208 */
186static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 209static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
187{ 210{
188 struct kprobe *kp; 211 struct kprobe *kp;
189 212
@@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
198 return 0; 221 return 0;
199} 222}
200 223
201static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 224static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
202 unsigned long flags) 225 unsigned long flags)
203{ 226{
204 struct kprobe *kp; 227 struct kprobe *kp;
205 228
@@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
213 return; 236 return;
214} 237}
215 238
216static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 239static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
217 int trapnr) 240 int trapnr)
218{ 241{
219 /* 242 /*
220 * if we faulted "during" the execution of a user specified 243 * if we faulted "during" the execution of a user specified
@@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
227 return 0; 250 return 0;
228} 251}
229 252
230static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 253static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
231{ 254{
232 struct kprobe *kp = curr_kprobe; 255 struct kprobe *kp = curr_kprobe;
233 if (curr_kprobe && kp->break_handler) { 256 if (curr_kprobe && kp->break_handler) {
@@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
240 return 0; 263 return 0;
241} 264}
242 265
243struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) 266struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
244{ 267{
245 struct hlist_node *node; 268 struct hlist_node *node;
246 struct kretprobe_instance *ri; 269 struct kretprobe_instance *ri;
@@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
249 return NULL; 272 return NULL;
250} 273}
251 274
252static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) 275static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp)
253{ 277{
254 struct hlist_node *node; 278 struct hlist_node *node;
255 struct kretprobe_instance *ri; 279 struct kretprobe_instance *ri;
@@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
258 return NULL; 282 return NULL;
259} 283}
260 284
261void add_rp_inst(struct kretprobe_instance *ri) 285void __kprobes add_rp_inst(struct kretprobe_instance *ri)
262{ 286{
263 /* 287 /*
264 * Remove rp inst off the free list - 288 * Remove rp inst off the free list -
@@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
276 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 300 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
277} 301}
278 302
279void recycle_rp_inst(struct kretprobe_instance *ri) 303void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
280{ 304{
281 /* remove rp inst off the rprobe_inst_table */ 305 /* remove rp inst off the rprobe_inst_table */
282 hlist_del(&ri->hlist); 306 hlist_del(&ri->hlist);
@@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
291 kfree(ri); 315 kfree(ri);
292} 316}
293 317
294struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) 318struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
295{ 319{
296 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 320 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
297} 321}
@@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
302 * instances associated with this task. These left over instances represent 326 * instances associated with this task. These left over instances represent
303 * probed functions that have been called but will never return. 327 * probed functions that have been called but will never return.
304 */ 328 */
305void kprobe_flush_task(struct task_struct *tk) 329void __kprobes kprobe_flush_task(struct task_struct *tk)
306{ 330{
307 struct kretprobe_instance *ri; 331 struct kretprobe_instance *ri;
308 struct hlist_head *head; 332 struct hlist_head *head;
@@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk)
322 * This kprobe pre_handler is registered with every kretprobe. When probe 346 * This kprobe pre_handler is registered with every kretprobe. When probe
323 * hits it will set up the return probe. 347 * hits it will set up the return probe.
324 */ 348 */
325static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) 349static int __kprobes pre_handler_kretprobe(struct kprobe *p,
350 struct pt_regs *regs)
326{ 351{
327 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 352 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
328 353
@@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
353* Add the new probe to old_p->list. Fail if this is the 378* Add the new probe to old_p->list. Fail if this is the
354* second jprobe at the address - two jprobes can't coexist 379* second jprobe at the address - two jprobes can't coexist
355*/ 380*/
356static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 381static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
357{ 382{
358 struct kprobe *kp; 383 struct kprobe *kp;
359 384
@@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
395 * the intricacies 420 * the intricacies
396 * TODO: Move kcalloc outside the spinlock 421 * TODO: Move kcalloc outside the spinlock
397 */ 422 */
398static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) 423static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
424 struct kprobe *p)
399{ 425{
400 int ret = 0; 426 int ret = 0;
401 struct kprobe *ap; 427 struct kprobe *ap;
@@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
434 spin_unlock_irqrestore(&kprobe_lock, flags); 460 spin_unlock_irqrestore(&kprobe_lock, flags);
435} 461}
436 462
437int register_kprobe(struct kprobe *p) 463static int __kprobes in_kprobes_functions(unsigned long addr)
464{
465 if (addr >= (unsigned long)__kprobes_text_start
466 && addr < (unsigned long)__kprobes_text_end)
467 return -EINVAL;
468 return 0;
469}
470
471int __kprobes register_kprobe(struct kprobe *p)
438{ 472{
439 int ret = 0; 473 int ret = 0;
440 unsigned long flags = 0; 474 unsigned long flags = 0;
441 struct kprobe *old_p; 475 struct kprobe *old_p;
442 476
443 if ((ret = arch_prepare_kprobe(p)) != 0) { 477 if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
478 return ret;
479 if ((ret = arch_prepare_kprobe(p)) != 0)
444 goto rm_kprobe; 480 goto rm_kprobe;
445 } 481
446 spin_lock_irqsave(&kprobe_lock, flags); 482 spin_lock_irqsave(&kprobe_lock, flags);
447 old_p = get_kprobe(p->addr); 483 old_p = get_kprobe(p->addr);
448 p->nmissed = 0; 484 p->nmissed = 0;
@@ -466,7 +502,7 @@ rm_kprobe:
466 return ret; 502 return ret;
467} 503}
468 504
469void unregister_kprobe(struct kprobe *p) 505void __kprobes unregister_kprobe(struct kprobe *p)
470{ 506{
471 unsigned long flags; 507 unsigned long flags;
472 struct kprobe *old_p; 508 struct kprobe *old_p;
@@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = {
487 .priority = 0x7fffffff /* we need to notified first */ 523 .priority = 0x7fffffff /* we need to notified first */
488}; 524};
489 525
490int register_jprobe(struct jprobe *jp) 526int __kprobes register_jprobe(struct jprobe *jp)
491{ 527{
492 /* Todo: Verify probepoint is a function entry point */ 528 /* Todo: Verify probepoint is a function entry point */
493 jp->kp.pre_handler = setjmp_pre_handler; 529 jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp)
496 return register_kprobe(&jp->kp); 532 return register_kprobe(&jp->kp);
497} 533}
498 534
499void unregister_jprobe(struct jprobe *jp) 535void __kprobes unregister_jprobe(struct jprobe *jp)
500{ 536{
501 unregister_kprobe(&jp->kp); 537 unregister_kprobe(&jp->kp);
502} 538}
503 539
504#ifdef ARCH_SUPPORTS_KRETPROBES 540#ifdef ARCH_SUPPORTS_KRETPROBES
505 541
506int register_kretprobe(struct kretprobe *rp) 542int __kprobes register_kretprobe(struct kretprobe *rp)
507{ 543{
508 int ret = 0; 544 int ret = 0;
509 struct kretprobe_instance *inst; 545 struct kretprobe_instance *inst;
@@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp)
540 576
541#else /* ARCH_SUPPORTS_KRETPROBES */ 577#else /* ARCH_SUPPORTS_KRETPROBES */
542 578
543int register_kretprobe(struct kretprobe *rp) 579int __kprobes register_kretprobe(struct kretprobe *rp)
544{ 580{
545 return -ENOSYS; 581 return -ENOSYS;
546} 582}
547 583
548#endif /* ARCH_SUPPORTS_KRETPROBES */ 584#endif /* ARCH_SUPPORTS_KRETPROBES */
549 585
550void unregister_kretprobe(struct kretprobe *rp) 586void __kprobes unregister_kretprobe(struct kretprobe *rp)
551{ 587{
552 unsigned long flags; 588 unsigned long flags;
553 struct kretprobe_instance *ri; 589 struct kretprobe_instance *ri;
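The kprobes hunks above do two related things: every function on the register/unregister path gains the __kprobes marker (which places it in a dedicated, protected text section), and register_kprobe() now calls in_kprobes_functions() to refuse a probe whose address lands between __kprobes_text_start and __kprobes_text_end, so the kprobes machinery can never end up probing itself. A minimal standalone sketch of that range check, with a static buffer standing in for the linker-provided section boundaries:

```c
#include <stdio.h>
#include <stdint.h>

/* Stand-in for the [__kprobes_text_start, __kprobes_text_end) range that
 * the linker provides in the kernel; here it is just a static buffer. */
static char kprobes_text[4096];

#define KPROBES_TEXT_START ((uintptr_t)kprobes_text)
#define KPROBES_TEXT_END   ((uintptr_t)(kprobes_text + sizeof(kprobes_text)))

/* Same shape as in_kprobes_functions(): refuse any address that falls
 * inside the protected range, accept everything else. */
static int in_kprobes_functions(uintptr_t addr)
{
    if (addr >= KPROBES_TEXT_START && addr < KPROBES_TEXT_END)
        return -22;            /* -EINVAL */
    return 0;
}

int main(void)
{
    printf("probe inside protected text : %d\n",
           in_kprobes_functions(KPROBES_TEXT_START + 8));
    printf("probe outside protected text: %d\n",
           in_kprobes_functions(KPROBES_TEXT_END + 8));
    return 0;
}
```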
diff --git a/kernel/module.c b/kernel/module.c
index c32995fbd8fd..ff5c500ab625 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,6 +20,7 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/moduleloader.h> 21#include <linux/moduleloader.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kernel.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
25#include <linux/elf.h> 26#include <linux/elf.h>
@@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags)
498{ 499{
499 int ret = (flags & O_TRUNC); 500 int ret = (flags & O_TRUNC);
500 if (ret) 501 if (ret)
501 tainted |= TAINT_FORCED_MODULE; 502 add_taint(TAINT_FORCED_MODULE);
502 return ret; 503 return ret;
503} 504}
504#else 505#else
@@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs,
897 if (!(tainted & TAINT_FORCED_MODULE)) { 898 if (!(tainted & TAINT_FORCED_MODULE)) {
898 printk("%s: no version for \"%s\" found: kernel tainted.\n", 899 printk("%s: no version for \"%s\" found: kernel tainted.\n",
899 mod->name, symname); 900 mod->name, symname);
900 tainted |= TAINT_FORCED_MODULE; 901 add_taint(TAINT_FORCED_MODULE);
901 } 902 }
902 return 1; 903 return 1;
903} 904}
@@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const char *license)
1352 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { 1353 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
1353 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", 1354 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
1354 mod->name, license); 1355 mod->name, license);
1355 tainted |= TAINT_PROPRIETARY_MODULE; 1356 add_taint(TAINT_PROPRIETARY_MODULE);
1356 } 1357 }
1357} 1358}
1358 1359
@@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod,
1509 long err = 0; 1510 long err = 0;
1510 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1511 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1511 struct exception_table_entry *extable; 1512 struct exception_table_entry *extable;
1513 mm_segment_t old_fs;
1512 1514
1513 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1515 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1514 umod, len, uargs); 1516 umod, len, uargs);
@@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod,
1609 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1611 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1610 /* This is allowed: modprobe --force will invalidate it. */ 1612 /* This is allowed: modprobe --force will invalidate it. */
1611 if (!modmagic) { 1613 if (!modmagic) {
1612 tainted |= TAINT_FORCED_MODULE; 1614 add_taint(TAINT_FORCED_MODULE);
1613 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1615 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1614 mod->name); 1616 mod->name);
1615 } else if (!same_magic(modmagic, vermagic)) { 1617 } else if (!same_magic(modmagic, vermagic)) {
@@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod,
1738 (mod->num_gpl_syms && !gplcrcindex)) { 1740 (mod->num_gpl_syms && !gplcrcindex)) {
1739 printk(KERN_WARNING "%s: No versions for exported symbols." 1741 printk(KERN_WARNING "%s: No versions for exported symbols."
1740 " Tainting kernel.\n", mod->name); 1742 " Tainting kernel.\n", mod->name);
1741 tainted |= TAINT_FORCED_MODULE; 1743 add_taint(TAINT_FORCED_MODULE);
1742 } 1744 }
1743#endif 1745#endif
1744 1746
@@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod,
1779 if (err < 0) 1781 if (err < 0)
1780 goto cleanup; 1782 goto cleanup;
1781 1783
1784 /* flush the icache in correct context */
1785 old_fs = get_fs();
1786 set_fs(KERNEL_DS);
1787
1788 /*
1789 * Flush the instruction cache, since we've played with text.
1790 * Do it before processing of module parameters, so the module
1791 * can provide parameter accessor functions of its own.
1792 */
1793 if (mod->module_init)
1794 flush_icache_range((unsigned long)mod->module_init,
1795 (unsigned long)mod->module_init
1796 + mod->init_size);
1797 flush_icache_range((unsigned long)mod->module_core,
1798 (unsigned long)mod->module_core + mod->core_size);
1799
1800 set_fs(old_fs);
1801
1782 mod->args = args; 1802 mod->args = args;
1783 if (obsparmindex) { 1803 if (obsparmindex) {
1784 err = obsolete_params(mod->name, mod->args, 1804 err = obsolete_params(mod->name, mod->args,
@@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod,
1860 const char __user *uargs) 1880 const char __user *uargs)
1861{ 1881{
1862 struct module *mod; 1882 struct module *mod;
1863 mm_segment_t old_fs = get_fs();
1864 int ret = 0; 1883 int ret = 0;
1865 1884
1866 /* Must have permission */ 1885 /* Must have permission */
@@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod,
1878 return PTR_ERR(mod); 1897 return PTR_ERR(mod);
1879 } 1898 }
1880 1899
1881 /* flush the icache in correct context */
1882 set_fs(KERNEL_DS);
1883
1884 /* Flush the instruction cache, since we've played with text */
1885 if (mod->module_init)
1886 flush_icache_range((unsigned long)mod->module_init,
1887 (unsigned long)mod->module_init
1888 + mod->init_size);
1889 flush_icache_range((unsigned long)mod->module_core,
1890 (unsigned long)mod->module_core + mod->core_size);
1891
1892 set_fs(old_fs);
1893
1894 /* Now sew it into the lists. They won't access us, since 1900 /* Now sew it into the lists. They won't access us, since
1895 strong_try_module_get() will fail. */ 1901 strong_try_module_get() will fail. */
1896 stop_machine_run(__link_module, mod, NR_CPUS); 1902 stop_machine_run(__link_module, mod, NR_CPUS);
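In module.c the icache flush moves out of sys_init_module() and into load_module(), wrapped in a get_fs()/set_fs(KERNEL_DS) bracket and placed before module parameters are processed, so that parameter accessor functions living in the freshly written module text execute against a coherent instruction cache. A loose userspace model of that save/override/restore ordering (the fs value and helpers below are toy stand-ins for mm_segment_t, get_fs() and set_fs(), not the kernel API):

```c
#include <stdio.h>

/* Toy stand-ins for mm_segment_t handling: 1 plays USER_DS, 0 plays
 * KERNEL_DS. Only the save/override/restore ordering matters here. */
static int current_fs = 1;
static int get_fs(void)    { return current_fs; }
static void set_fs(int fs) { current_fs = fs; }

static void flush_icache_range(unsigned long start, unsigned long end)
{
    printf("flush icache %#lx-%#lx (fs=%d)\n", start, end, current_fs);
}

static void parse_module_params(void)
{
    /* Parameter "set" hooks may live in the module's own text, so the
     * icache must already be coherent by the time this runs. */
    puts("process module parameters");
}

int main(void)
{
    unsigned long core = 0x100000, core_size = 0x2000;
    int old_fs = get_fs();

    set_fs(0);                              /* KERNEL_DS */
    flush_icache_range(core, core + core_size);
    set_fs(old_fs);

    parse_module_params();                  /* ordering the patch establishes */
    return 0;
}
```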
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..fbf173215fd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
542{ 542{
543 struct module_kobject *mk; 543 struct module_kobject *mk;
544 544
545 mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); 545 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
546 memset(mk, 0, sizeof(struct module_kobject)); 546 BUG_ON(!mk);
547 547
548 mk->mod = THIS_MODULE; 548 mk->mod = THIS_MODULE;
549 kobj_set_kset_s(mk, module_subsys); 549 kobj_set_kset_s(mk, module_subsys);
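The params.c change (and the matching ones in kernel/power/pm.c and kernel/resource.c further down) replaces the kmalloc()-then-memset() pair with kzalloc(), which returns zeroed memory in a single call; the new BUG_ON(!mk) keeps the code's existing assumption that this boot-time allocation cannot fail. A small userspace model of the same conversion, with calloc() standing in for kzalloc() and a made-up struct:

```c
#include <assert.h>
#include <stdlib.h>

/* Userspace stand-in for kzalloc(): one call that allocates and zeroes,
 * replacing the kmalloc()+memset() pair. */
static void *kzalloc_model(size_t size)
{
    return calloc(1, size);
}

struct module_kobject_model {   /* hypothetical struct, for illustration only */
    void *mod;
    int initialized;
};

int main(void)
{
    struct module_kobject_model *mk = kzalloc_model(sizeof(*mk));

    assert(mk != NULL);                     /* mirrors the BUG_ON(!mk) */
    assert(mk->mod == NULL && mk->initialized == 0);
    free(mk);
    return 0;
}
```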
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 38798a2ff994..b7b532acd9fc 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
427 timr->sigq->info.si_code = SI_TIMER; 427 timr->sigq->info.si_code = SI_TIMER;
428 timr->sigq->info.si_tid = timr->it_id; 428 timr->sigq->info.si_tid = timr->it_id;
429 timr->sigq->info.si_value = timr->it_sigev_value; 429 timr->sigq->info.si_value = timr->it_sigev_value;
430
430 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 431 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
431 if (unlikely(timr->it_process->flags & PF_EXITING)) { 432 struct task_struct *leader;
432 timr->it_sigev_notify = SIGEV_SIGNAL; 433 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
433 put_task_struct(timr->it_process); 434 timr->it_process);
434 timr->it_process = timr->it_process->group_leader; 435
435 goto group; 436 if (likely(ret >= 0))
436 } 437 return ret;
437 return send_sigqueue(timr->it_sigev_signo, timr->sigq, 438
438 timr->it_process); 439 timr->it_sigev_notify = SIGEV_SIGNAL;
439 } 440 leader = timr->it_process->group_leader;
440 else { 441 put_task_struct(timr->it_process);
441 group: 442 timr->it_process = leader;
442 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
443 timr->it_process);
444 } 443 }
444
445 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
446 timr->it_process);
445} 447}
446EXPORT_SYMBOL_GPL(posix_timer_event); 448EXPORT_SYMBOL_GPL(posix_timer_event);
447 449
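The posix_timer_event() rewrite stops second-guessing the target thread's PF_EXITING flag: for SIGEV_THREAD_ID timers it simply attempts the thread-directed send_sigqueue() first, and only when that delivery fails does it switch the timer to SIGEV_SIGNAL, retarget it at the group leader, and fall through to send_group_sigqueue(). A standalone sketch of that try-then-fall-back control flow (the two send functions are stubs, not the kernel API):

```c
#include <stdio.h>

/* Stubs for send_sigqueue()/send_group_sigqueue(); a negative return
 * models the thread-directed delivery failing (target thread exiting). */
static int send_to_thread(int thread_alive) { return thread_alive ? 0 : -3; }
static int send_to_group(void)              { return 0; }

static int timer_event(int thread_directed, int thread_alive)
{
    if (thread_directed) {
        int ret = send_to_thread(thread_alive);

        if (ret >= 0)
            return ret;     /* delivered to the specific thread */
        /* delivery failed: retarget the timer at the group leader */
    }
    return send_to_group();
}

int main(void)
{
    printf("thread alive -> %d\n", timer_event(1, 1));
    printf("thread gone  -> %d\n", timer_event(1, 0));
    printf("group timer  -> %d\n", timer_event(0, 0));
    return 0;
}
```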
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2c7121d9bff1..396c7873e804 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,5 +1,6 @@
1config PM 1config PM
2 bool "Power Management support" 2 bool "Power Management support"
3 depends on !IA64_HP_SIM
3 ---help--- 4 ---help---
4 "Power Management" means that parts of your computer are shut 5 "Power Management" means that parts of your computer are shut
5 off or put into a power conserving "sleep" mode if they are not 6 off or put into a power conserving "sleep" mode if they are not
@@ -28,7 +29,7 @@ config PM_DEBUG
28 29
29config SOFTWARE_SUSPEND 30config SOFTWARE_SUSPEND
30 bool "Software Suspend" 31 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) 32 depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP))
32 ---help--- 33 ---help---
33 Enable the possibility of suspending the machine. 34 Enable the possibility of suspending the machine.
34 It doesn't need APM. 35 It doesn't need APM.
@@ -72,6 +73,18 @@ config PM_STD_PARTITION
72 suspended image to. It will simply pick the first available swap 73 suspended image to. It will simply pick the first available swap
73 device. 74 device.
74 75
76config SWSUSP_ENCRYPT
77 bool "Encrypt suspend image"
78 depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
79 default ""
80 ---help---
81 To prevent data gathering from swap after resume you can encrypt
82 the suspend image with a temporary key that is deleted on
83 resume.
84
85 Note that the temporary key is stored unencrypted on disk while the
86 system is suspended.
87
75config SUSPEND_SMP 88config SUSPEND_SMP
76 bool 89 bool
77 depends on HOTPLUG_CPU && X86 && PM 90 depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 664eb0469b6e..2d8bf054d036 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -112,24 +112,12 @@ static inline void platform_finish(void)
112 } 112 }
113} 113}
114 114
115static void finish(void)
116{
117 device_resume();
118 platform_finish();
119 thaw_processes();
120 enable_nonboot_cpus();
121 pm_restore_console();
122}
123
124
125static int prepare_processes(void) 115static int prepare_processes(void)
126{ 116{
127 int error; 117 int error;
128 118
129 pm_prepare_console(); 119 pm_prepare_console();
130
131 sys_sync(); 120 sys_sync();
132
133 disable_nonboot_cpus(); 121 disable_nonboot_cpus();
134 122
135 if (freeze_processes()) { 123 if (freeze_processes()) {
@@ -162,15 +150,6 @@ static void unprepare_processes(void)
162 pm_restore_console(); 150 pm_restore_console();
163} 151}
164 152
165static int prepare_devices(void)
166{
167 int error;
168
169 if ((error = device_suspend(PMSG_FREEZE)))
170 printk("Some devices failed to suspend\n");
171 return error;
172}
173
174/** 153/**
175 * pm_suspend_disk - The granpappy of power management. 154 * pm_suspend_disk - The granpappy of power management.
176 * 155 *
@@ -187,17 +166,14 @@ int pm_suspend_disk(void)
187 error = prepare_processes(); 166 error = prepare_processes();
188 if (error) 167 if (error)
189 return error; 168 return error;
190 error = prepare_devices();
191 169
170 error = device_suspend(PMSG_FREEZE);
192 if (error) { 171 if (error) {
172 printk("Some devices failed to suspend\n");
193 unprepare_processes(); 173 unprepare_processes();
194 return error; 174 return error;
195 } 175 }
196 176
197 pr_debug("PM: Attempting to suspend to disk.\n");
198 if (pm_disk_mode == PM_DISK_FIRMWARE)
199 return pm_ops->enter(PM_SUSPEND_DISK);
200
201 pr_debug("PM: snapshotting memory.\n"); 177 pr_debug("PM: snapshotting memory.\n");
202 in_suspend = 1; 178 in_suspend = 1;
203 if ((error = swsusp_suspend())) 179 if ((error = swsusp_suspend()))
@@ -208,11 +184,20 @@ int pm_suspend_disk(void)
208 error = swsusp_write(); 184 error = swsusp_write();
209 if (!error) 185 if (!error)
210 power_down(pm_disk_mode); 186 power_down(pm_disk_mode);
187 else {
188 /* swsusp_write can not fail in device_resume,
189 no need to do second device_resume */
190 swsusp_free();
191 unprepare_processes();
192 return error;
193 }
211 } else 194 } else
212 pr_debug("PM: Image restored successfully.\n"); 195 pr_debug("PM: Image restored successfully.\n");
196
213 swsusp_free(); 197 swsusp_free();
214 Done: 198 Done:
215 finish(); 199 device_resume();
200 unprepare_processes();
216 return error; 201 return error;
217} 202}
218 203
@@ -233,9 +218,12 @@ static int software_resume(void)
233{ 218{
234 int error; 219 int error;
235 220
221 down(&pm_sem);
236 if (!swsusp_resume_device) { 222 if (!swsusp_resume_device) {
237 if (!strlen(resume_file)) 223 if (!strlen(resume_file)) {
224 up(&pm_sem);
238 return -ENOENT; 225 return -ENOENT;
226 }
239 swsusp_resume_device = name_to_dev_t(resume_file); 227 swsusp_resume_device = name_to_dev_t(resume_file);
240 pr_debug("swsusp: Resume From Partition %s\n", resume_file); 228 pr_debug("swsusp: Resume From Partition %s\n", resume_file);
241 } else { 229 } else {
@@ -248,6 +236,7 @@ static int software_resume(void)
248 * FIXME: If noresume is specified, we need to find the partition 236 * FIXME: If noresume is specified, we need to find the partition
249 * and reset it back to normal swap space. 237 * and reset it back to normal swap space.
250 */ 238 */
239 up(&pm_sem);
251 return 0; 240 return 0;
252 } 241 }
253 242
@@ -270,20 +259,24 @@ static int software_resume(void)
270 259
271 pr_debug("PM: Preparing devices for restore.\n"); 260 pr_debug("PM: Preparing devices for restore.\n");
272 261
273 if ((error = prepare_devices())) 262 if ((error = device_suspend(PMSG_FREEZE))) {
263 printk("Some devices failed to suspend\n");
274 goto Free; 264 goto Free;
265 }
275 266
276 mb(); 267 mb();
277 268
278 pr_debug("PM: Restoring saved image.\n"); 269 pr_debug("PM: Restoring saved image.\n");
279 swsusp_resume(); 270 swsusp_resume();
280 pr_debug("PM: Restore failed, recovering.\n"); 271 pr_debug("PM: Restore failed, recovering.\n");
281 finish(); 272 device_resume();
282 Free: 273 Free:
283 swsusp_free(); 274 swsusp_free();
284 Cleanup: 275 Cleanup:
285 unprepare_processes(); 276 unprepare_processes();
286 Done: 277 Done:
278 /* For success case, the suspend path will release the lock */
279 up(&pm_sem);
287 pr_debug("PM: Resume from disk failed.\n"); 280 pr_debug("PM: Resume from disk failed.\n");
288 return 0; 281 return 0;
289} 282}
@@ -390,7 +383,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t
390 if (sscanf(buf, "%u:%u", &maj, &min) == 2) { 383 if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
391 res = MKDEV(maj,min); 384 res = MKDEV(maj,min);
392 if (maj == MAJOR(res) && min == MINOR(res)) { 385 if (maj == MAJOR(res) && min == MINOR(res)) {
386 down(&pm_sem);
393 swsusp_resume_device = res; 387 swsusp_resume_device = res;
388 up(&pm_sem);
394 printk("Attempting manual resume\n"); 389 printk("Attempting manual resume\n");
395 noresume = 0; 390 noresume = 0;
396 software_resume(); 391 software_resume();
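The disk.c changes fold the old prepare_devices()/finish() helpers into their callers and serialise the resume paths with pm_sem. The practical effect on pm_suspend_disk() is a plain unwind ladder: a device_suspend(PMSG_FREEZE) failure undoes only prepare_processes(), a swsusp_write() failure frees the image and unprepares processes without a second device_resume(), and the normal path finishes with device_resume() plus unprepare_processes(). A compact sketch of that unwind-only-what-succeeded shape (all step names below are placeholders):

```c
#include <stdio.h>

static int prepare_ok = 1, suspend_ok = 0;  /* toggle to exercise the paths */

static int  prepare_processes(void)   { return prepare_ok ? 0 : -1; }
static void unprepare_processes(void) { puts("unprepare processes"); }
static int  suspend_devices(void)     { return suspend_ok ? 0 : -1; }
static void resume_devices(void)      { puts("resume devices"); }

/* Mirrors the shape pm_suspend_disk() ends up with: each failure point
 * undoes exactly the steps that already completed, nothing more. */
static int suspend_to_disk(void)
{
    int error = prepare_processes();
    if (error)
        return error;

    error = suspend_devices();
    if (error) {
        puts("some devices failed to suspend");
        unprepare_processes();
        return error;
    }

    puts("snapshot, write image, power down");
    resume_devices();
    unprepare_processes();
    return 0;
}

int main(void)
{
    printf("result = %d\n", suspend_to_disk());
    return 0;
}
```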
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 71aa0fd22007..22bdc93cc038 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state)
143 143
144 144
145 145
146static char * pm_states[] = { 146static char *pm_states[PM_SUSPEND_MAX] = {
147 [PM_SUSPEND_STANDBY] = "standby", 147 [PM_SUSPEND_STANDBY] = "standby",
148 [PM_SUSPEND_MEM] = "mem", 148 [PM_SUSPEND_MEM] = "mem",
149#ifdef CONFIG_SOFTWARE_SUSPEND
149 [PM_SUSPEND_DISK] = "disk", 150 [PM_SUSPEND_DISK] = "disk",
150 NULL, 151#endif
151}; 152};
152 153
153 154
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id, 60 unsigned long id,
61 pm_callback callback) 61 pm_callback callback)
62{ 62{
63 struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); 63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) { 64 if (dev) {
65 memset(dev, 0, sizeof(*dev));
66 dev->type = type; 65 dev->type = type;
67 dev->id = id; 66 dev->id = id;
68 dev->callback = callback; 67 dev->callback = callback;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3bd0d261818f..28de118f7a0b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -38,7 +38,6 @@ void refrigerator(void)
38 processes around? */ 38 processes around? */
39 long save; 39 long save;
40 save = current->state; 40 save = current->state;
41 current->state = TASK_UNINTERRUPTIBLE;
42 pr_debug("%s entered refrigerator\n", current->comm); 41 pr_debug("%s entered refrigerator\n", current->comm);
43 printk("="); 42 printk("=");
44 43
@@ -47,8 +46,10 @@ void refrigerator(void)
47 recalc_sigpending(); /* We sent fake signal, clean it up */ 46 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock); 47 spin_unlock_irq(&current->sighand->siglock);
49 48
50 while (frozen(current)) 49 while (frozen(current)) {
50 current->state = TASK_UNINTERRUPTIBLE;
51 schedule(); 51 schedule();
52 }
52 pr_debug("%s left refrigerator\n", current->comm); 53 pr_debug("%s left refrigerator\n", current->comm);
53 current->state = save; 54 current->state = save;
54} 55}
@@ -80,13 +81,33 @@ int freeze_processes(void)
80 } while_each_thread(g, p); 81 } while_each_thread(g, p);
81 read_unlock(&tasklist_lock); 82 read_unlock(&tasklist_lock);
82 yield(); /* Yield is okay here */ 83 yield(); /* Yield is okay here */
83 if (time_after(jiffies, start_time + TIMEOUT)) { 84 if (todo && time_after(jiffies, start_time + TIMEOUT)) {
84 printk( "\n" ); 85 printk( "\n" );
85 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); 86 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
86 return todo; 87 break;
87 } 88 }
88 } while(todo); 89 } while(todo);
89 90
91 /* This does not unfreeze processes that are already frozen
92 * (we have slightly ugly calling convention in that respect,
93 * and caller must call thaw_processes() if something fails),
94 * but it cleans up leftover PF_FREEZE requests.
95 */
96 if (todo) {
97 read_lock(&tasklist_lock);
98 do_each_thread(g, p)
99 if (freezing(p)) {
100 pr_debug(" clean up: %s\n", p->comm);
101 p->flags &= ~PF_FREEZE;
102 spin_lock_irqsave(&p->sighand->siglock, flags);
103 recalc_sigpending_tsk(p);
104 spin_unlock_irqrestore(&p->sighand->siglock, flags);
105 }
106 while_each_thread(g, p);
107 read_unlock(&tasklist_lock);
108 return todo;
109 }
110
90 printk( "|\n" ); 111 printk( "|\n" );
91 BUG_ON(in_atomic()); 112 BUG_ON(in_atomic());
92 return 0; 113 return 0;
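Two behaviour changes sit in process.c: refrigerator() now sets TASK_UNINTERRUPTIBLE inside the while (frozen(...)) loop, immediately before each schedule(), which is the usual set-state/re-test/sleep idiom for not losing a wakeup that races with the test; and freeze_processes() no longer bails out on timeout but clears the leftover PF_FREEZE requests before returning the number of stragglers. A loose single-threaded model of why re-arming the state inside the loop is safe even if the thaw arrives at the worst possible moment (everything here is a stand-in, not kernel scheduling):

```c
#include <stdio.h>

enum state { RUNNING, SLEEPING };

static enum state task_state = RUNNING;
static int frozen = 1;
static int pending_thaw = 1;    /* a thaw will arrive at the worst moment */

/* Models the waker: clear the freeze flag, then make the task runnable. */
static void thaw(void)
{
    frozen = 0;
    task_state = RUNNING;
}

/* Models schedule(): only blocks if the task is still marked SLEEPING. */
static void schedule_model(void)
{
    if (task_state == SLEEPING)
        puts("task blocks until the next wakeup");
    else
        puts("wakeup already arrived, schedule() returns immediately");
}

int main(void)
{
    /* Patched refrigerator() loop: re-arm the sleep state on every
     * iteration, immediately before schedule(), and re-test frozen. */
    while (frozen) {
        task_state = SLEEPING;
        if (pending_thaw) {             /* race: thaw lands right here */
            pending_thaw = 0;
            thaw();
        }
        schedule_model();
    }
    puts("left refrigerator");
    return 0;
}
```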
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f2bc71b9fe8b..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,6 +31,9 @@
31 * Alex Badea <vampire@go.ro>: 31 * Alex Badea <vampire@go.ro>:
32 * Fixed runaway init 32 * Fixed runaway init
33 * 33 *
34 * Andreas Steinmetz <ast@domdv.de>:
35 * Added encrypted suspend option
36 *
34 * More state savers are welcome. Especially for the scsi layer... 37 * More state savers are welcome. Especially for the scsi layer...
35 * 38 *
36 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt 39 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -71,8 +74,16 @@
71#include <asm/tlbflush.h> 74#include <asm/tlbflush.h>
72#include <asm/io.h> 75#include <asm/io.h>
73 76
77#include <linux/random.h>
78#include <linux/crypto.h>
79#include <asm/scatterlist.h>
80
74#include "power.h" 81#include "power.h"
75 82
83#define CIPHER "aes"
84#define MAXKEY 32
85#define MAXIV 32
86
76/* References to section boundaries */ 87/* References to section boundaries */
77extern const void __nosave_begin, __nosave_end; 88extern const void __nosave_begin, __nosave_end;
78 89
@@ -103,7 +114,8 @@ static suspend_pagedir_t *pagedir_save;
103#define SWSUSP_SIG "S1SUSPEND" 114#define SWSUSP_SIG "S1SUSPEND"
104 115
105static struct swsusp_header { 116static struct swsusp_header {
106 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 117 char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)];
118 u8 key_iv[MAXKEY+MAXIV];
107 swp_entry_t swsusp_info; 119 swp_entry_t swsusp_info;
108 char orig_sig[10]; 120 char orig_sig[10];
109 char sig[10]; 121 char sig[10];
@@ -129,6 +141,131 @@ static struct swsusp_info swsusp_info;
129static unsigned short swapfile_used[MAX_SWAPFILES]; 141static unsigned short swapfile_used[MAX_SWAPFILES];
130static unsigned short root_swap; 142static unsigned short root_swap;
131 143
144static int write_page(unsigned long addr, swp_entry_t * loc);
145static int bio_read_page(pgoff_t page_off, void * page);
146
147static u8 key_iv[MAXKEY+MAXIV];
148
149#ifdef CONFIG_SWSUSP_ENCRYPT
150
151static int crypto_init(int mode, void **mem)
152{
153 int error = 0;
154 int len;
155 char *modemsg;
156 struct crypto_tfm *tfm;
157
158 modemsg = mode ? "suspend not possible" : "resume not possible";
159
160 tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
161 if(!tfm) {
162 printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
163 error = -EINVAL;
164 goto out;
165 }
166
167 if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
168 printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
169 error = -ENOKEY;
170 goto fail;
171 }
172
173 if (mode)
174 get_random_bytes(key_iv, MAXKEY+MAXIV);
175
176 len = crypto_tfm_alg_max_keysize(tfm);
177 if (len > MAXKEY)
178 len = MAXKEY;
179
180 if (crypto_cipher_setkey(tfm, key_iv, len)) {
181 printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
182 error = -EKEYREJECTED;
183 goto fail;
184 }
185
186 len = crypto_tfm_alg_ivsize(tfm);
187
188 if (MAXIV < len) {
189 printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
190 error = -EOVERFLOW;
191 goto fail;
192 }
193
194 crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
195
196 *mem=(void *)tfm;
197
198 goto out;
199
200fail: crypto_free_tfm(tfm);
201out: return error;
202}
203
204static __inline__ void crypto_exit(void *mem)
205{
206 crypto_free_tfm((struct crypto_tfm *)mem);
207}
208
209static __inline__ int crypto_write(struct pbe *p, void *mem)
210{
211 int error = 0;
212 struct scatterlist src, dst;
213
214 src.page = virt_to_page(p->address);
215 src.offset = 0;
216 src.length = PAGE_SIZE;
217 dst.page = virt_to_page((void *)&swsusp_header);
218 dst.offset = 0;
219 dst.length = PAGE_SIZE;
220
221 error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
222 PAGE_SIZE);
223
224 if (!error)
225 error = write_page((unsigned long)&swsusp_header,
226 &(p->swap_address));
227 return error;
228}
229
230static __inline__ int crypto_read(struct pbe *p, void *mem)
231{
232 int error = 0;
233 struct scatterlist src, dst;
234
235 error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
236 if (!error) {
237 src.offset = 0;
238 src.length = PAGE_SIZE;
239 dst.offset = 0;
240 dst.length = PAGE_SIZE;
241 src.page = dst.page = virt_to_page((void *)p->address);
242
243 error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
244 &src, PAGE_SIZE);
245 }
246 return error;
247}
248#else
249static __inline__ int crypto_init(int mode, void *mem)
250{
251 return 0;
252}
253
254static __inline__ void crypto_exit(void *mem)
255{
256}
257
258static __inline__ int crypto_write(struct pbe *p, void *mem)
259{
260 return write_page(p->address, &(p->swap_address));
261}
262
263static __inline__ int crypto_read(struct pbe *p, void *mem)
264{
265 return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
266}
267#endif
268
132static int mark_swapfiles(swp_entry_t prev) 269static int mark_swapfiles(swp_entry_t prev)
133{ 270{
134 int error; 271 int error;
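The new crypto_init()/crypto_write()/crypto_read() wrappers give the image writer and reader a single call site whether or not CONFIG_SWSUSP_ENCRYPT is enabled: with encryption on, each page is run through an AES-CBC transform keyed from key_iv (freshly generated for suspend, copied into the swap header by mark_swapfiles(), then read back and wiped on resume), and with it off the wrappers collapse to plain write_page()/bio_read_page() calls. A toy model of that compile-time pass-through pattern, with a trivial XOR in place of the real cipher (build with -DMODEL_ENCRYPT to get the transforming variant):

```c
#include <stdio.h>
#include <string.h>

#define PAGE 16

#ifdef MODEL_ENCRYPT
/* Toy "cipher": XOR with a constant. The kernel patch uses the crypto
 * API (the "aes" cipher in CBC mode) at this point instead. */
static void transform(unsigned char *buf, size_t len)
{
    for (size_t i = 0; i < len; i++)
        buf[i] ^= 0x5a;
}
static void page_write(unsigned char *page)
{
    transform(page, PAGE);      /* encrypt in place, then write */
    puts("write encrypted page");
}
static void page_read(unsigned char *page)
{
    puts("read encrypted page");
    transform(page, PAGE);      /* decrypt in place after reading */
}
#else
/* With the option disabled the wrappers are plain pass-throughs. */
static void page_write(unsigned char *page) { (void)page; puts("write page"); }
static void page_read(unsigned char *page)  { (void)page; puts("read page"); }
#endif

int main(void)
{
    unsigned char page[PAGE] = "suspend image!!";

    page_write(page);   /* suspend path */
    page_read(page);    /* resume path */
    printf("round trip: %s\n", (char *)page);
    return 0;
}
```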
@@ -140,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev)
140 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 277 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
141 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 278 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
142 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 279 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
280 memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV);
143 swsusp_header.swsusp_info = prev; 281 swsusp_header.swsusp_info = prev;
144 error = rw_swap_page_sync(WRITE, 282 error = rw_swap_page_sync(WRITE,
145 swp_entry(root_swap, 0), 283 swp_entry(root_swap, 0),
@@ -179,9 +317,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */
179 len=strlen(resume_file); 317 len=strlen(resume_file);
180 root_swap = 0xFFFF; 318 root_swap = 0xFFFF;
181 319
182 swap_list_lock(); 320 spin_lock(&swap_lock);
183 for (i=0; i<MAX_SWAPFILES; i++) { 321 for (i=0; i<MAX_SWAPFILES; i++) {
184 if (swap_info[i].flags == 0) { 322 if (!(swap_info[i].flags & SWP_WRITEOK)) {
185 swapfile_used[i]=SWAPFILE_UNUSED; 323 swapfile_used[i]=SWAPFILE_UNUSED;
186 } else { 324 } else {
187 if (!len) { 325 if (!len) {
@@ -202,7 +340,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
202 } 340 }
203 } 341 }
204 } 342 }
205 swap_list_unlock(); 343 spin_unlock(&swap_lock);
206 return (root_swap != 0xffff) ? 0 : -ENODEV; 344 return (root_swap != 0xffff) ? 0 : -ENODEV;
207} 345}
208 346
@@ -216,12 +354,12 @@ static void lock_swapdevices(void)
216{ 354{
217 int i; 355 int i;
218 356
219 swap_list_lock(); 357 spin_lock(&swap_lock);
220 for (i = 0; i< MAX_SWAPFILES; i++) 358 for (i = 0; i< MAX_SWAPFILES; i++)
221 if (swapfile_used[i] == SWAPFILE_IGNORED) { 359 if (swapfile_used[i] == SWAPFILE_IGNORED) {
222 swap_info[i].flags ^= 0xFF; 360 swap_info[i].flags ^= SWP_WRITEOK;
223 } 361 }
224 swap_list_unlock(); 362 spin_unlock(&swap_lock);
225} 363}
226 364
227/** 365/**
@@ -286,6 +424,10 @@ static int data_write(void)
286 int error = 0, i = 0; 424 int error = 0, i = 0;
287 unsigned int mod = nr_copy_pages / 100; 425 unsigned int mod = nr_copy_pages / 100;
288 struct pbe *p; 426 struct pbe *p;
427 void *tfm;
428
429 if ((error = crypto_init(1, &tfm)))
430 return error;
289 431
290 if (!mod) 432 if (!mod)
291 mod = 1; 433 mod = 1;
@@ -294,11 +436,14 @@ static int data_write(void)
294 for_each_pbe (p, pagedir_nosave) { 436 for_each_pbe (p, pagedir_nosave) {
295 if (!(i%mod)) 437 if (!(i%mod))
296 printk( "\b\b\b\b%3d%%", i / mod ); 438 printk( "\b\b\b\b%3d%%", i / mod );
297 if ((error = write_page(p->address, &(p->swap_address)))) 439 if ((error = crypto_write(p, tfm))) {
440 crypto_exit(tfm);
298 return error; 441 return error;
442 }
299 i++; 443 i++;
300 } 444 }
301 printk("\b\b\b\bdone\n"); 445 printk("\b\b\b\bdone\n");
446 crypto_exit(tfm);
302 return error; 447 return error;
303} 448}
304 449
@@ -385,7 +530,6 @@ static int write_pagedir(void)
385 * write_suspend_image - Write entire image and metadata. 530 * write_suspend_image - Write entire image and metadata.
386 * 531 *
387 */ 532 */
388
389static int write_suspend_image(void) 533static int write_suspend_image(void)
390{ 534{
391 int error; 535 int error;
@@ -400,6 +544,7 @@ static int write_suspend_image(void)
400 if ((error = close_swap())) 544 if ((error = close_swap()))
401 goto FreePagedir; 545 goto FreePagedir;
402 Done: 546 Done:
547 memset(key_iv, 0, MAXKEY+MAXIV);
403 return error; 548 return error;
404 FreePagedir: 549 FreePagedir:
405 free_pagedir_entries(); 550 free_pagedir_entries();
@@ -591,18 +736,7 @@ static void copy_data_pages(void)
591 736
592static int calc_nr(int nr_copy) 737static int calc_nr(int nr_copy)
593{ 738{
594 int extra = 0; 739 return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1);
595 int mod = !!(nr_copy % PBES_PER_PAGE);
596 int diff = (nr_copy / PBES_PER_PAGE) + mod;
597
598 do {
599 extra += diff;
600 nr_copy += diff;
601 mod = !!(nr_copy % PBES_PER_PAGE);
602 diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
603 } while (diff > 0);
604
605 return nr_copy;
606} 740}
607 741
608/** 742/**
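The calc_nr() rewrite replaces the iterative fixpoint with a closed form. Reading the old loop, it keeps growing the number of extra metadata pages e until e * PBES_PER_PAGE >= nr_copy + e, i.e. until e reaches nr_copy / (PBES_PER_PAGE - 1) rounded up; writing that ceiling with the usual (a + b - 1) / b integer idiom gives exactly nr_copy + (nr_copy + PBES_PER_PAGE - 2) / (PBES_PER_PAGE - 1). A standalone check that the two versions agree (PBES_PER_PAGE is set to an assumed example value here; in the kernel it derives from PAGE_SIZE and sizeof(struct pbe)):

```c
#include <stdio.h>

#define PBES_PER_PAGE 128   /* assumed example value, for the comparison only */

/* The pre-patch iterative version, kept as-is. */
static int calc_nr_old(int nr_copy)
{
    int extra = 0;
    int mod = !!(nr_copy % PBES_PER_PAGE);
    int diff = (nr_copy / PBES_PER_PAGE) + mod;

    do {
        extra += diff;
        nr_copy += diff;
        mod = !!(nr_copy % PBES_PER_PAGE);
        diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
    } while (diff > 0);

    return nr_copy;
}

/* The closed form the patch introduces. */
static int calc_nr_new(int nr_copy)
{
    return nr_copy + (nr_copy + PBES_PER_PAGE - 2) / (PBES_PER_PAGE - 1);
}

int main(void)
{
    for (int n = 1; n <= 100000; n++)
        if (calc_nr_old(n) != calc_nr_new(n)) {
            printf("mismatch at %d: %d vs %d\n",
                   n, calc_nr_old(n), calc_nr_new(n));
            return 1;
        }
    puts("iterative and closed-form versions agree");
    return 0;
}
```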
@@ -886,20 +1020,21 @@ int swsusp_suspend(void)
886 * at resume time, and evil weirdness ensues. 1020 * at resume time, and evil weirdness ensues.
887 */ 1021 */
888 if ((error = device_power_down(PMSG_FREEZE))) { 1022 if ((error = device_power_down(PMSG_FREEZE))) {
1023 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
889 local_irq_enable(); 1024 local_irq_enable();
890 return error; 1025 return error;
891 } 1026 }
892 1027
893 if ((error = swsusp_swap_check())) { 1028 if ((error = swsusp_swap_check())) {
894 printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " 1029 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
895 "swapon -a!\n"); 1030 device_power_up();
896 local_irq_enable(); 1031 local_irq_enable();
897 return error; 1032 return error;
898 } 1033 }
899 1034
900 save_processor_state(); 1035 save_processor_state();
901 if ((error = swsusp_arch_suspend())) 1036 if ((error = swsusp_arch_suspend()))
902 printk("Error %d suspending\n", error); 1037 printk(KERN_ERR "Error %d suspending\n", error);
903 /* Restore control flow magically appears here */ 1038 /* Restore control flow magically appears here */
904 restore_processor_state(); 1039 restore_processor_state();
905 BUG_ON (nr_copy_pages_check != nr_copy_pages); 1040 BUG_ON (nr_copy_pages_check != nr_copy_pages);
@@ -924,6 +1059,7 @@ int swsusp_resume(void)
924 BUG_ON(!error); 1059 BUG_ON(!error);
925 restore_processor_state(); 1060 restore_processor_state();
926 restore_highmem(); 1061 restore_highmem();
1062 touch_softlockup_watchdog();
927 device_power_up(); 1063 device_power_up();
928 local_irq_enable(); 1064 local_irq_enable();
929 return error; 1065 return error;
@@ -1179,7 +1315,8 @@ static const char * sanity_check(void)
1179 if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) 1315 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1180 return "machine"; 1316 return "machine";
1181#if 0 1317#if 0
1182 if(swsusp_info.cpus != num_online_cpus()) 1318 /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
1319 if (swsusp_info.cpus != num_possible_cpus())
1183 return "number of cpus"; 1320 return "number of cpus";
1184#endif 1321#endif
1185 return NULL; 1322 return NULL;
@@ -1212,13 +1349,14 @@ static int check_sig(void)
1212 return error; 1349 return error;
1213 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 1350 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1214 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 1351 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1352 memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
1353 memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
1215 1354
1216 /* 1355 /*
1217 * Reset swap signature now. 1356 * Reset swap signature now.
1218 */ 1357 */
1219 error = bio_write_page(0, &swsusp_header); 1358 error = bio_write_page(0, &swsusp_header);
1220 } else { 1359 } else {
1221 printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
1222 return -EINVAL; 1360 return -EINVAL;
1223 } 1361 }
1224 if (!error) 1362 if (!error)
@@ -1239,6 +1377,10 @@ static int data_read(struct pbe *pblist)
1239 int error = 0; 1377 int error = 0;
1240 int i = 0; 1378 int i = 0;
1241 int mod = swsusp_info.image_pages / 100; 1379 int mod = swsusp_info.image_pages / 100;
1380 void *tfm;
1381
1382 if ((error = crypto_init(0, &tfm)))
1383 return error;
1242 1384
1243 if (!mod) 1385 if (!mod)
1244 mod = 1; 1386 mod = 1;
@@ -1250,14 +1392,15 @@ static int data_read(struct pbe *pblist)
1250 if (!(i % mod)) 1392 if (!(i % mod))
1251 printk("\b\b\b\b%3d%%", i / mod); 1393 printk("\b\b\b\b%3d%%", i / mod);
1252 1394
1253 error = bio_read_page(swp_offset(p->swap_address), 1395 if ((error = crypto_read(p, tfm))) {
1254 (void *)p->address); 1396 crypto_exit(tfm);
1255 if (error)
1256 return error; 1397 return error;
1398 }
1257 1399
1258 i++; 1400 i++;
1259 } 1401 }
1260 printk("\b\b\b\bdone\n"); 1402 printk("\b\b\b\bdone\n");
1403 crypto_exit(tfm);
1261 return error; 1404 return error;
1262} 1405}
1263 1406
@@ -1385,6 +1528,7 @@ int swsusp_read(void)
1385 1528
1386 error = read_suspend_image(); 1529 error = read_suspend_image();
1387 blkdev_put(resume_bdev); 1530 blkdev_put(resume_bdev);
1531 memset(key_iv, 0, MAXKEY+MAXIV);
1388 1532
1389 if (!error) 1533 if (!error)
1390 pr_debug("swsusp: Reading resume file was successful\n"); 1534 pr_debug("swsusp: Reading resume file was successful\n");
diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..a967605bc2e3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...)
514 return r; 514 return r;
515} 515}
516 516
517/* cpu currently holding logbuf_lock */
518static volatile unsigned int printk_cpu = UINT_MAX;
519
517asmlinkage int vprintk(const char *fmt, va_list args) 520asmlinkage int vprintk(const char *fmt, va_list args)
518{ 521{
519 unsigned long flags; 522 unsigned long flags;
@@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
522 static char printk_buf[1024]; 525 static char printk_buf[1024];
523 static int log_level_unknown = 1; 526 static int log_level_unknown = 1;
524 527
525 if (unlikely(oops_in_progress)) 528 preempt_disable();
529 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
530 /* If a crash is occurring during printk() on this CPU,
531 * make sure we can't deadlock */
526 zap_locks(); 532 zap_locks();
527 533
528 /* This stops the holder of console_sem just where we want him */ 534 /* This stops the holder of console_sem just where we want him */
529 spin_lock_irqsave(&logbuf_lock, flags); 535 spin_lock_irqsave(&logbuf_lock, flags);
536 printk_cpu = smp_processor_id();
530 537
531 /* Emit the output into the temporary buffer */ 538 /* Emit the output into the temporary buffer */
532 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 539 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
595 * CPU until it is officially up. We shouldn't be calling into 602 * CPU until it is officially up. We shouldn't be calling into
596 * random console drivers on a CPU which doesn't exist yet.. 603 * random console drivers on a CPU which doesn't exist yet..
597 */ 604 */
605 printk_cpu = UINT_MAX;
598 spin_unlock_irqrestore(&logbuf_lock, flags); 606 spin_unlock_irqrestore(&logbuf_lock, flags);
599 goto out; 607 goto out;
600 } 608 }
@@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
604 * We own the drivers. We can drop the spinlock and let 612 * We own the drivers. We can drop the spinlock and let
605 * release_console_sem() print the text 613 * release_console_sem() print the text
606 */ 614 */
615 printk_cpu = UINT_MAX;
607 spin_unlock_irqrestore(&logbuf_lock, flags); 616 spin_unlock_irqrestore(&logbuf_lock, flags);
608 console_may_schedule = 0; 617 console_may_schedule = 0;
609 release_console_sem(); 618 release_console_sem();
@@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
613 * allows the semaphore holder to proceed and to call the 622 * allows the semaphore holder to proceed and to call the
614 * console drivers with the output which we just produced. 623 * console drivers with the output which we just produced.
615 */ 624 */
625 printk_cpu = UINT_MAX;
616 spin_unlock_irqrestore(&logbuf_lock, flags); 626 spin_unlock_irqrestore(&logbuf_lock, flags);
617 } 627 }
618out: 628out:
629 preempt_enable();
619 return printed_len; 630 return printed_len;
620} 631}
621EXPORT_SYMBOL(printk); 632EXPORT_SYMBOL(printk);
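vprintk() now records which CPU holds logbuf_lock in printk_cpu, resets it to UINT_MAX before every unlock, and only calls zap_locks() when an oops is in progress on that same CPU; preempt_disable()/preempt_enable() keep smp_processor_id() stable across the check. The point is to bust the console locks only when this CPU is the one that already holds them (printk() recursing from its own crash), not when some other CPU merely happens to be printing. A small standalone model of that test (identifiers are stand-ins):

```c
#include <stdio.h>
#include <limits.h>

static unsigned int lock_owner_cpu = UINT_MAX;  /* models printk_cpu */
static int oops_in_progress;

static void zap_locks(void) { puts("re-initialise console locks"); }

/* Only break the lock when the crashing CPU is the one that already
 * holds it; otherwise we would corrupt another CPU's critical section. */
static void maybe_zap(unsigned int this_cpu)
{
    if (oops_in_progress && lock_owner_cpu == this_cpu)
        zap_locks();
    else
        puts("take the lock normally");
}

int main(void)
{
    oops_in_progress = 1;

    lock_owner_cpu = 1;
    maybe_zap(0);   /* some other CPU holds it: wait, do not zap */
    maybe_zap(1);   /* we hold it ourselves: recursing, so zap */
    return 0;
}
```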
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
118 return ret; 118 return ret;
119} 119}
120 120
121static int may_attach(struct task_struct *task)
122{
123 if (!task->mm)
124 return -EPERM;
125 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) ||
127 (current->uid != task->uid) ||
128 (current->gid != task->egid) ||
129 (current->gid != task->sgid) ||
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM;
132 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM;
135
136 return security_ptrace(current, task);
137}
138
139int ptrace_may_attach(struct task_struct *task)
140{
141 int err;
142 task_lock(task);
143 err = may_attach(task);
144 task_unlock(task);
145 return !err;
146}
147
121int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
122{ 149{
123 int retval; 150 int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
127 goto bad; 154 goto bad;
128 if (task == current) 155 if (task == current)
129 goto bad; 156 goto bad;
130 if (!task->mm)
131 goto bad;
132 if(((current->uid != task->euid) ||
133 (current->uid != task->suid) ||
134 (current->uid != task->uid) ||
135 (current->gid != task->egid) ||
136 (current->gid != task->sgid) ||
137 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
138 goto bad;
139 smp_rmb();
140 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
141 goto bad;
142 /* the same process cannot be attached many times */ 157 /* the same process cannot be attached many times */
143 if (task->ptrace & PT_PTRACED) 158 if (task->ptrace & PT_PTRACED)
144 goto bad; 159 goto bad;
145 retval = security_ptrace(current, task); 160 retval = may_attach(task);
146 if (retval) 161 if (retval)
147 goto bad; 162 goto bad;
148 163
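The ptrace change is a refactor plus one new entry point: the uid/gid, dumpable and security_ptrace() checks move from ptrace_attach() into may_attach(), and ptrace_may_attach() wraps them in task_lock()/task_unlock() and returns a boolean, so other code paths can ask "would ptrace be permitted here?" without duplicating the policy. A compressed standalone model of the one-predicate, two-callers shape (the real check also compares euid/suid/egid/sgid and consults CAP_SYS_PTRACE and the security hook):

```c
#include <stdio.h>

struct task { int uid; int dumpable; };

/* Single policy function, shared by every caller (models may_attach()). */
static int may_attach(const struct task *tracer, const struct task *target)
{
    if (tracer->uid != target->uid && tracer->uid != 0)
        return -1;      /* -EPERM */
    if (!target->dumpable && tracer->uid != 0)
        return -1;
    return 0;
}

/* Boolean wrapper, as ptrace_may_attach() is for permission queries. */
static int ptrace_may_attach(const struct task *tracer, const struct task *target)
{
    return may_attach(tracer, target) == 0;
}

int main(void)
{
    struct task user = { 1000, 1 }, other = { 1001, 1 }, root = { 0, 1 };

    printf("same uid : %d\n", ptrace_may_attach(&user, &user));
    printf("other uid: %d\n", ptrace_may_attach(&user, &other));
    printf("root     : %d\n", ptrace_may_attach(&root, &other));
    return 0;
}
```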
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f436993bd590..bef3b6901b76 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
45#include <linux/percpu.h> 45#include <linux/percpu.h>
46#include <linux/notifier.h> 46#include <linux/notifier.h>
47#include <linux/rcupdate.h> 47#include <linux/rcupdate.h>
48#include <linux/rcuref.h>
48#include <linux/cpu.h> 49#include <linux/cpu.h>
49 50
50/* Definition for rcupdate control block. */ 51/* Definition for rcupdate control block. */
@@ -72,6 +73,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
72static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 73static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
73static int maxbatch = 10; 74static int maxbatch = 10;
74 75
76#ifndef __HAVE_ARCH_CMPXCHG
77/*
78 * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
79 * 32 bit atomic_t implementations, and a hash function similar to that
80 * for our refcounting needs.
81 * Can't help multiprocessors which do not have cmpxchg :(
82 */
83
84spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
85 [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
86};
87#endif
88
75/** 89/**
76 * call_rcu - Queue an RCU callback for invocation after a grace period. 90 * call_rcu - Queue an RCU callback for invocation after a grace period.
77 * @head: structure to be used for queueing the RCU updates. 91 * @head: structure to be used for queueing the RCU updates.
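The rcupdate.c addition provides the fallback for architectures without a cmpxchg instruction: rcuref operations hash the counter's address into the __rcuref_hash array of RCUREF_HASH_SIZE spinlocks, so a given counter is always serialised by the same lock while unrelated counters rarely contend, the same trick the 32-bit sparc atomic_t code uses. A userspace sketch of the hashed-lock idea, with pthread mutexes standing in for spinlocks and an assumed table size:

```c
#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

#define HASH_SIZE 4     /* stands in for RCUREF_HASH_SIZE; real size is config-dependent */

static pthread_mutex_t hash_lock[HASH_SIZE] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Pick a lock by hashing the counter's address: one counter always maps
 * to one lock, different counters usually map to different locks. */
static pthread_mutex_t *lock_for(void *addr)
{
    return &hash_lock[((uintptr_t)addr >> 4) % HASH_SIZE];
}

static void ref_inc(int *ref)
{
    pthread_mutex_lock(lock_for(ref));
    (*ref)++;
    pthread_mutex_unlock(lock_for(ref));
}

int main(void)
{
    int refcount = 1;

    ref_inc(&refcount);
    printf("refcount = %d\n", refcount);
    return 0;
}
```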
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
430 */ 430 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
432{ 432{
433 struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); 433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 434
435 if (res) { 435 if (res) {
436 memset(res, 0, sizeof(*res));
437 res->name = name; 436 res->name = name;
438 res->start = start; 437 res->start = start;
439 res->end = start + n - 1; 438 res->end = start + n - 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..81b3a96ed2d0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
875 * smp_call_function() if an IPI is sent by the same process we are 875 * smp_call_function() if an IPI is sent by the same process we are
876 * waiting to become inactive. 876 * waiting to become inactive.
877 */ 877 */
878void wait_task_inactive(task_t * p) 878void wait_task_inactive(task_t *p)
879{ 879{
880 unsigned long flags; 880 unsigned long flags;
881 runqueue_t *rq; 881 runqueue_t *rq;
@@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
966 int local_group; 966 int local_group;
967 int i; 967 int i;
968 968
969 /* Skip over this group if it has no CPUs allowed */
970 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
971 goto nextgroup;
972
969 local_group = cpu_isset(this_cpu, group->cpumask); 973 local_group = cpu_isset(this_cpu, group->cpumask);
970 /* XXX: put a cpus allowed check */
971 974
972 /* Tally up the load of all CPUs in the group */ 975 /* Tally up the load of all CPUs in the group */
973 avg_load = 0; 976 avg_load = 0;
@@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
992 min_load = avg_load; 995 min_load = avg_load;
993 idlest = group; 996 idlest = group;
994 } 997 }
998nextgroup:
995 group = group->next; 999 group = group->next;
996 } while (group != sd->groups); 1000 } while (group != sd->groups);
997 1001
@@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1003/* 1007/*
1004 * find_idlest_queue - find the idlest runqueue among the cpus in group. 1008 * find_idlest_queue - find the idlest runqueue among the cpus in group.
1005 */ 1009 */
1006static int find_idlest_cpu(struct sched_group *group, int this_cpu) 1010static int
1011find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1007{ 1012{
1013 cpumask_t tmp;
1008 unsigned long load, min_load = ULONG_MAX; 1014 unsigned long load, min_load = ULONG_MAX;
1009 int idlest = -1; 1015 int idlest = -1;
1010 int i; 1016 int i;
1011 1017
1012 for_each_cpu_mask(i, group->cpumask) { 1018 /* Traverse only the allowed CPUs */
1019 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1020
1021 for_each_cpu_mask(i, tmp) {
1013 load = source_load(i, 0); 1022 load = source_load(i, 0);
1014 1023
1015 if (load < min_load || (load == min_load && i == this_cpu)) { 1024 if (load < min_load || (load == min_load && i == this_cpu)) {
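These two hunks make fork/exec balancing honour p->cpus_allowed: find_idlest_group() skips any group whose cpumask does not intersect the task's allowed mask, and find_idlest_cpu() scans only the intersection computed by cpus_and(), so the chosen CPU can no longer be one the task is forbidden to run on. A standalone model of the masked scan, using a plain bitmask in place of cpumask_t:

```c
#include <stdio.h>

/* Bitmask stand-ins for cpumask_t; cpus_and() is just '&' here. */
static int pick_idlest(unsigned group_mask, unsigned allowed, const int load[])
{
    unsigned tmp = group_mask & allowed;    /* traverse only allowed CPUs */
    int best = -1, best_load = 1 << 30;

    for (int cpu = 0; cpu < 32; cpu++)
        if (((tmp >> cpu) & 1) && load[cpu] < best_load) {
            best_load = load[cpu];
            best = cpu;
        }
    return best;    /* -1 means the whole group was disallowed */
}

int main(void)
{
    int load[32] = { [0] = 10, [1] = 2, [2] = 5, [3] = 1 };

    /* CPU 3 is the idlest overall, but the task may only run on 0-1. */
    printf("chosen cpu = %d\n", pick_idlest(0xF, 0x3, load));
    return 0;
}
```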
@@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag)
1052 if (!group) 1061 if (!group)
1053 goto nextlevel; 1062 goto nextlevel;
1054 1063
1055 new_cpu = find_idlest_cpu(group, cpu); 1064 new_cpu = find_idlest_cpu(group, t, cpu);
1056 if (new_cpu == -1 || new_cpu == cpu) 1065 if (new_cpu == -1 || new_cpu == cpu)
1057 goto nextlevel; 1066 goto nextlevel;
1058 1067
@@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p)
1127 * 1136 *
1128 * returns failure only if the task is already active. 1137 * returns failure only if the task is already active.
1129 */ 1138 */
1130static int try_to_wake_up(task_t * p, unsigned int state, int sync) 1139static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1131{ 1140{
1132 int cpu, this_cpu, success = 0; 1141 int cpu, this_cpu, success = 0;
1133 unsigned long flags; 1142 unsigned long flags;
@@ -1252,6 +1261,16 @@ out_activate:
1252 } 1261 }
1253 1262
1254 /* 1263 /*
1264 * Tasks that have marked their sleep as noninteractive get
1265 * woken up without updating their sleep average. (i.e. their
1266 * sleep is handled in a priority-neutral manner, no priority
1267 * boost and no penalty.)
1268 */
1269 if (old_state & TASK_NONINTERACTIVE)
1270 __activate_task(p, rq);
1271 else
1272 activate_task(p, rq, cpu == this_cpu);
1273 /*
1255 * Sync wakeups (i.e. those types of wakeups where the waker 1274 * Sync wakeups (i.e. those types of wakeups where the waker
1256 * has indicated that it will leave the CPU in short order) 1275 * has indicated that it will leave the CPU in short order)
1257 * don't trigger a preemption, if the woken up task will run on 1276 * don't trigger a preemption, if the woken up task will run on
@@ -1259,7 +1278,6 @@ out_activate:
1259 * the waker guarantees that the freshly woken up task is going 1278 * the waker guarantees that the freshly woken up task is going
1260 * to be considered on this CPU.) 1279 * to be considered on this CPU.)
1261 */ 1280 */
1262 activate_task(p, rq, cpu == this_cpu);
1263 if (!sync || cpu != this_cpu) { 1281 if (!sync || cpu != this_cpu) {
1264 if (TASK_PREEMPTS_CURR(p, rq)) 1282 if (TASK_PREEMPTS_CURR(p, rq))
1265 resched_task(rq->curr); 1283 resched_task(rq->curr);
@@ -1274,7 +1292,7 @@ out:
1274 return success; 1292 return success;
1275} 1293}
1276 1294
1277int fastcall wake_up_process(task_t * p) 1295int fastcall wake_up_process(task_t *p)
1278{ 1296{
1279 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1297 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1280 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1298 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1353 * that must be done for every newly created context, then puts the task 1371 * that must be done for every newly created context, then puts the task
1354 * on the runqueue and wakes it. 1372 * on the runqueue and wakes it.
1355 */ 1373 */
1356void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) 1374void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1357{ 1375{
1358 unsigned long flags; 1376 unsigned long flags;
1359 int this_cpu, cpu; 1377 int this_cpu, cpu;
@@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1436 * artificially, because any timeslice recovered here 1454 * artificially, because any timeslice recovered here
1437 * was given away by the parent in the first place.) 1455 * was given away by the parent in the first place.)
1438 */ 1456 */
1439void fastcall sched_exit(task_t * p) 1457void fastcall sched_exit(task_t *p)
1440{ 1458{
1441 unsigned long flags; 1459 unsigned long flags;
1442 runqueue_t *rq; 1460 runqueue_t *rq;
@@ -1478,6 +1496,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1478 1496
1479/** 1497/**
1480 * finish_task_switch - clean up after a task-switch 1498 * finish_task_switch - clean up after a task-switch
1499 * @rq: runqueue associated with task-switch
1481 * @prev: the thread we just switched away from. 1500 * @prev: the thread we just switched away from.
1482 * 1501 *
1483 * finish_task_switch must be called after the context switch, paired 1502 * finish_task_switch must be called after the context switch, paired
@@ -1510,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1510 * Manfred Spraul <manfred@colorfullife.com> 1529 * Manfred Spraul <manfred@colorfullife.com>
1511 */ 1530 */
1512 prev_task_flags = prev->flags; 1531 prev_task_flags = prev->flags;
1532#ifdef CONFIG_DEBUG_SPINLOCK
1533 /* this is a valid case when another task releases the spinlock */
1534 rq->lock.owner = current;
1535#endif
1513 finish_arch_switch(prev); 1536 finish_arch_switch(prev);
1514 finish_lock_switch(rq, prev); 1537 finish_lock_switch(rq, prev);
1515 if (mm) 1538 if (mm)
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1752 */ 1775 */
1753static inline 1776static inline
1754int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1777int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1755 struct sched_domain *sd, enum idle_type idle, int *all_pinned) 1778 struct sched_domain *sd, enum idle_type idle,
1779 int *all_pinned)
1756{ 1780{
1757 /* 1781 /*
1758 * We do not migrate tasks that are: 1782 * We do not migrate tasks that are:
@@ -1882,10 +1906,11 @@ out:
1882 */ 1906 */
1883static struct sched_group * 1907static struct sched_group *
1884find_busiest_group(struct sched_domain *sd, int this_cpu, 1908find_busiest_group(struct sched_domain *sd, int this_cpu,
1885 unsigned long *imbalance, enum idle_type idle) 1909 unsigned long *imbalance, enum idle_type idle, int *sd_idle)
1886{ 1910{
1887 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1911 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1888 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1912 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1913 unsigned long max_pull;
1889 int load_idx; 1914 int load_idx;
1890 1915
1891 max_load = this_load = total_load = total_pwr = 0; 1916 max_load = this_load = total_load = total_pwr = 0;
@@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1907 avg_load = 0; 1932 avg_load = 0;
1908 1933
1909 for_each_cpu_mask(i, group->cpumask) { 1934 for_each_cpu_mask(i, group->cpumask) {
1935 if (*sd_idle && !idle_cpu(i))
1936 *sd_idle = 0;
1937
1910 /* Bias balancing toward cpus of our domain */ 1938 /* Bias balancing toward cpus of our domain */
1911 if (local_group) 1939 if (local_group)
1912 load = target_load(i, load_idx); 1940 load = target_load(i, load_idx);
@@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1932 group = group->next; 1960 group = group->next;
1933 } while (group != sd->groups); 1961 } while (group != sd->groups);
1934 1962
1935 if (!busiest || this_load >= max_load) 1963 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
1936 goto out_balanced; 1964 goto out_balanced;
1937 1965
1938 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 1966 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1952 * by pulling tasks to us. Be careful of negative numbers as they'll 1980 * by pulling tasks to us. Be careful of negative numbers as they'll
1953 * appear as very large values with unsigned longs. 1981 * appear as very large values with unsigned longs.
1954 */ 1982 */
1983
1984 /* Don't want to pull so many tasks that a group would go idle */
1985 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
1986
1955 /* How much load to actually move to equalise the imbalance */ 1987 /* How much load to actually move to equalise the imbalance */
1956 *imbalance = min((max_load - avg_load) * busiest->cpu_power, 1988 *imbalance = min(max_pull * busiest->cpu_power,
1957 (avg_load - this_load) * this->cpu_power) 1989 (avg_load - this_load) * this->cpu_power)
1958 / SCHED_LOAD_SCALE; 1990 / SCHED_LOAD_SCALE;
1959 1991
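The find_busiest_group() hunk caps the requested transfer: besides max_load - avg_load, the pull is now limited to max_load - SCHED_LOAD_SCALE (the new max_load <= SCHED_LOAD_SCALE test above already bails out before this point), so the balancer never asks for so much that the busiest group would be left with less than one CPU's worth of load and go idle in turn. A worked example with made-up load figures showing how the cap changes the computed imbalance:

```c
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL  /* one CPU's worth of load, as in the kernel */

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

int main(void)
{
    /* Hypothetical numbers: the busiest group is barely above one CPU of load. */
    unsigned long max_load = 140, avg_load = 60, this_load = 20;
    unsigned long cpu_power = SCHED_LOAD_SCALE;

    unsigned long old_imbalance = min_ul((max_load - avg_load) * cpu_power,
                                         (avg_load - this_load) * cpu_power)
                                  / SCHED_LOAD_SCALE;

    unsigned long max_pull = min_ul(max_load - avg_load,
                                    max_load - SCHED_LOAD_SCALE);
    unsigned long new_imbalance = min_ul(max_pull * cpu_power,
                                         (avg_load - this_load) * cpu_power)
                                  / SCHED_LOAD_SCALE;

    /* Uncapped formula would pull 40; the capped one pulls only 12,
     * leaving the busiest group with at least SCHED_LOAD_SCALE of load. */
    printf("old=%lu new=%lu\n", old_imbalance, new_imbalance);
    return 0;
}
```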
@@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2050 unsigned long imbalance; 2082 unsigned long imbalance;
2051 int nr_moved, all_pinned = 0; 2083 int nr_moved, all_pinned = 0;
2052 int active_balance = 0; 2084 int active_balance = 0;
2085 int sd_idle = 0;
2086
2087 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
2088 sd_idle = 1;
2053 2089
2054 spin_lock(&this_rq->lock);
2055 schedstat_inc(sd, lb_cnt[idle]); 2090 schedstat_inc(sd, lb_cnt[idle]);
2056 2091
2057 group = find_busiest_group(sd, this_cpu, &imbalance, idle); 2092 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
2058 if (!group) { 2093 if (!group) {
2059 schedstat_inc(sd, lb_nobusyg[idle]); 2094 schedstat_inc(sd, lb_nobusyg[idle]);
2060 goto out_balanced; 2095 goto out_balanced;
@@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2078 * still unbalanced. nr_moved simply stays zero, so it is 2113 * still unbalanced. nr_moved simply stays zero, so it is
2079 * correctly treated as an imbalance. 2114 * correctly treated as an imbalance.
2080 */ 2115 */
2081 double_lock_balance(this_rq, busiest); 2116 double_rq_lock(this_rq, busiest);
2082 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2117 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2083 imbalance, sd, idle, 2118 imbalance, sd, idle, &all_pinned);
2084 &all_pinned); 2119 double_rq_unlock(this_rq, busiest);
2085 spin_unlock(&busiest->lock);
2086 2120
2087 /* All tasks on this runqueue were pinned by CPU affinity */ 2121 /* All tasks on this runqueue were pinned by CPU affinity */
2088 if (unlikely(all_pinned)) 2122 if (unlikely(all_pinned))
2089 goto out_balanced; 2123 goto out_balanced;
2090 } 2124 }
2091 2125
2092 spin_unlock(&this_rq->lock);
2093
2094 if (!nr_moved) { 2126 if (!nr_moved) {
2095 schedstat_inc(sd, lb_failed[idle]); 2127 schedstat_inc(sd, lb_failed[idle]);
2096 sd->nr_balance_failed++; 2128 sd->nr_balance_failed++;
@@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2098 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2130 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2099 2131
2100 spin_lock(&busiest->lock); 2132 spin_lock(&busiest->lock);
2133
2134 /* don't kick the migration_thread, if the curr
2135 * task on busiest cpu can't be moved to this_cpu
2136 */
2137 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2138 spin_unlock(&busiest->lock);
2139 all_pinned = 1;
2140 goto out_one_pinned;
2141 }
2142
2101 if (!busiest->active_balance) { 2143 if (!busiest->active_balance) {
2102 busiest->active_balance = 1; 2144 busiest->active_balance = 1;
2103 busiest->push_cpu = this_cpu; 2145 busiest->push_cpu = this_cpu;
@@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2130 sd->balance_interval *= 2; 2172 sd->balance_interval *= 2;
2131 } 2173 }
2132 2174
2175 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2176 return -1;
2133 return nr_moved; 2177 return nr_moved;
2134 2178
2135out_balanced: 2179out_balanced:
2136 spin_unlock(&this_rq->lock);
2137
2138 schedstat_inc(sd, lb_balanced[idle]); 2180 schedstat_inc(sd, lb_balanced[idle]);
2139 2181
2140 sd->nr_balance_failed = 0; 2182 sd->nr_balance_failed = 0;
2183
2184out_one_pinned:
2141 /* tune up the balancing interval */ 2185 /* tune up the balancing interval */
2142 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2186 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2143 (sd->balance_interval < sd->max_interval)) 2187 (sd->balance_interval < sd->max_interval))
2144 sd->balance_interval *= 2; 2188 sd->balance_interval *= 2;
2145 2189
2190 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2191 return -1;
2146 return 0; 2192 return 0;
2147} 2193}
2148 2194
@@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2160 runqueue_t *busiest = NULL; 2206 runqueue_t *busiest = NULL;
2161 unsigned long imbalance; 2207 unsigned long imbalance;
2162 int nr_moved = 0; 2208 int nr_moved = 0;
2209 int sd_idle = 0;
2210
2211 if (sd->flags & SD_SHARE_CPUPOWER)
2212 sd_idle = 1;
2163 2213
2164 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2214 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2165 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2215 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
2166 if (!group) { 2216 if (!group) {
2167 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2217 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2168 goto out_balanced; 2218 goto out_balanced;
@@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2176 2226
2177 BUG_ON(busiest == this_rq); 2227 BUG_ON(busiest == this_rq);
2178 2228
2179 /* Attempt to move tasks */
2180 double_lock_balance(this_rq, busiest);
2181
2182 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2229 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2183 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2230
2231 nr_moved = 0;
2232 if (busiest->nr_running > 1) {
2233 /* Attempt to move tasks */
2234 double_lock_balance(this_rq, busiest);
2235 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2184 imbalance, sd, NEWLY_IDLE, NULL); 2236 imbalance, sd, NEWLY_IDLE, NULL);
2185 if (!nr_moved) 2237 spin_unlock(&busiest->lock);
2238 }
2239
2240 if (!nr_moved) {
2186 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2241 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2187 else 2242 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2243 return -1;
2244 } else
2188 sd->nr_balance_failed = 0; 2245 sd->nr_balance_failed = 0;
2189 2246
2190 spin_unlock(&busiest->lock);
2191 return nr_moved; 2247 return nr_moved;
2192 2248
2193out_balanced: 2249out_balanced:
2194 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2250 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2251 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2252 return -1;
2195 sd->nr_balance_failed = 0; 2253 sd->nr_balance_failed = 0;
2196 return 0; 2254 return 0;
2197} 2255}
@@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2316 2374
2317 if (j - sd->last_balance >= interval) { 2375 if (j - sd->last_balance >= interval) {
2318 if (load_balance(this_cpu, this_rq, sd, idle)) { 2376 if (load_balance(this_cpu, this_rq, sd, idle)) {
2319 /* We've pulled tasks over so no longer idle */ 2377 /*
2378 * We've pulled tasks over so either we're no
2379 * longer idle, or one of our SMT siblings is
2380 * not idle.
2381 */
2320 idle = NOT_IDLE; 2382 idle = NOT_IDLE;
2321 } 2383 }
2322 sd->last_balance += interval; 2384 sd->last_balance += interval;
@@ -2575,6 +2637,13 @@ out:
2575} 2637}
2576 2638
2577#ifdef CONFIG_SCHED_SMT 2639#ifdef CONFIG_SCHED_SMT
2640static inline void wakeup_busy_runqueue(runqueue_t *rq)
2641{
2642 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2643 if (rq->curr == rq->idle && rq->nr_running)
2644 resched_task(rq->idle);
2645}
2646
2578static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2647static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2579{ 2648{
2580 struct sched_domain *tmp, *sd = NULL; 2649 struct sched_domain *tmp, *sd = NULL;
@@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2608 for_each_cpu_mask(i, sibling_map) { 2677 for_each_cpu_mask(i, sibling_map) {
2609 runqueue_t *smt_rq = cpu_rq(i); 2678 runqueue_t *smt_rq = cpu_rq(i);
2610 2679
2611 /* 2680 wakeup_busy_runqueue(smt_rq);
2612 * If an SMT sibling task is sleeping due to priority
2613 * reasons wake it up now.
2614 */
2615 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2616 resched_task(smt_rq->idle);
2617 } 2681 }
2618 2682
2619 for_each_cpu_mask(i, sibling_map) 2683 for_each_cpu_mask(i, sibling_map)
@@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2624 */ 2688 */
2625} 2689}
2626 2690
2691/*
2692 * number of 'lost' timeslices this task won't be able to fully
2693 * utilize, if another task runs on a sibling. This models the
2694 * slowdown effect of other tasks running on siblings:
2695 */
2696static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2697{
2698 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2699}
2700
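/*
 * A minimal user-space sketch of the smt_slice() arithmetic above, using
 * hypothetical values (per_cpu_gain = 25, a 100-tick timeslice); it only
 * models the formula, it is not kernel code.
 */
#include <stdio.h>

static unsigned long model_smt_slice(unsigned long time_slice, int per_cpu_gain)
{
	return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
	/* A sibling running concurrently is assumed to cost per_cpu_gain% of
	 * throughput, so a 100-tick slice is effectively worth 75 ticks. */
	printf("%lu\n", model_smt_slice(100, 25));	/* prints 75 */
	return 0;
}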
2627static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2701static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2628{ 2702{
2629 struct sched_domain *tmp, *sd = NULL; 2703 struct sched_domain *tmp, *sd = NULL;
@@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2667 runqueue_t *smt_rq = cpu_rq(i); 2741 runqueue_t *smt_rq = cpu_rq(i);
2668 task_t *smt_curr = smt_rq->curr; 2742 task_t *smt_curr = smt_rq->curr;
2669 2743
2744 /* Kernel threads do not participate in dependent sleeping */
2745 if (!p->mm || !smt_curr->mm || rt_task(p))
2746 goto check_smt_task;
2747
2670 /* 2748 /*
2671 * If a user task with lower static priority than the 2749 * If a user task with lower static priority than the
2672 * running task on the SMT sibling is trying to schedule, 2750 * running task on the SMT sibling is trying to schedule,
@@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2675 * task from using an unfair proportion of the 2753 * task from using an unfair proportion of the
2676 * physical cpu's resources. -ck 2754 * physical cpu's resources. -ck
2677 */ 2755 */
2678 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > 2756 if (rt_task(smt_curr)) {
2679 task_timeslice(p) || rt_task(smt_curr)) && 2757 /*
2680 p->mm && smt_curr->mm && !rt_task(p)) 2758 * With real time tasks we run non-rt tasks only
2681 ret = 1; 2759 * per_cpu_gain% of the time.
2760 */
2761 if ((jiffies % DEF_TIMESLICE) >
2762 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2763 ret = 1;
2764 } else
2765 if (smt_curr->static_prio < p->static_prio &&
2766 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2767 smt_slice(smt_curr, sd) > task_timeslice(p))
2768 ret = 1;
2769
2770check_smt_task:
2771 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2772 rt_task(smt_curr))
2773 continue;
2774 if (!p->mm) {
2775 wakeup_busy_runqueue(smt_rq);
2776 continue;
2777 }
2682 2778
2683 /* 2779 /*
2684 * Reschedule a lower priority task on the SMT sibling, 2780 * Reschedule a lower priority task on the SMT sibling for
2685 * or wake it up if it has been put to sleep for priority 2781 * it to be put to sleep, or wake it up if it has been put to
2686 * reasons. 2782 * sleep for priority reasons to see if it should run now.
2687 */ 2783 */
2688 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > 2784 if (rt_task(p)) {
2689 task_timeslice(smt_curr) || rt_task(p)) && 2785 if ((jiffies % DEF_TIMESLICE) >
2690 smt_curr->mm && p->mm && !rt_task(smt_curr)) || 2786 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2691 (smt_curr == smt_rq->idle && smt_rq->nr_running)) 2787 resched_task(smt_curr);
2692 resched_task(smt_curr); 2788 } else {
2789 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2790 smt_slice(p, sd) > task_timeslice(smt_curr))
2791 resched_task(smt_curr);
2792 else
2793 wakeup_busy_runqueue(smt_rq);
2794 }
2693 } 2795 }
2694out_unlock: 2796out_unlock:
2695 for_each_cpu_mask(i, sibling_map) 2797 for_each_cpu_mask(i, sibling_map)
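/*
 * A small user-space model of the "run non-rt siblings only per_cpu_gain%
 * of the time" gate used in dependent_sleeper() above. DEF_TIMESLICE and
 * per_cpu_gain are stand-ins with hypothetical values (100 jiffies, 25),
 * not taken from the kernel headers.
 */
#include <stdio.h>

#define MODEL_TIMESLICE	100	/* stands in for DEF_TIMESLICE */

static int throttle_non_rt_sibling(unsigned long jiffies, int per_cpu_gain)
{
	/* Mirrors: (jiffies % DEF_TIMESLICE) > (per_cpu_gain * DEF_TIMESLICE / 100) */
	return (jiffies % MODEL_TIMESLICE) > (per_cpu_gain * MODEL_TIMESLICE / 100);
}

int main(void)
{
	unsigned long j;
	int allowed = 0;

	/* Over one 100-jiffy window the non-rt sibling is left alone for
	 * roughly the first quarter and asked to reschedule for the rest. */
	for (j = 0; j < MODEL_TIMESLICE; j++)
		if (!throttle_non_rt_sibling(j, 25))
			allowed++;
	printf("non-rt sibling runs for %d of %d jiffies\n", allowed, MODEL_TIMESLICE);
	return 0;
}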
@@ -2887,6 +2989,7 @@ switch_tasks:
2887 if (next == rq->idle) 2989 if (next == rq->idle)
2888 schedstat_inc(rq, sched_goidle); 2990 schedstat_inc(rq, sched_goidle);
2889 prefetch(next); 2991 prefetch(next);
2992 prefetch_stack(next);
2890 clear_tsk_need_resched(prev); 2993 clear_tsk_need_resched(prev);
2891 rcu_qsctr_inc(task_cpu(prev)); 2994 rcu_qsctr_inc(task_cpu(prev));
2892 2995
@@ -3014,7 +3117,8 @@ need_resched:
3014 3117
3015#endif /* CONFIG_PREEMPT */ 3118#endif /* CONFIG_PREEMPT */
3016 3119
3017int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 3120int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3121 void *key)
3018{ 3122{
3019 task_t *p = curr->private; 3123 task_t *p = curr->private;
3020 return try_to_wake_up(p, mode, sync); 3124 return try_to_wake_up(p, mode, sync);
@@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3056 * @key: is directly passed to the wakeup function 3160 * @key: is directly passed to the wakeup function
3057 */ 3161 */
3058void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3162void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3059 int nr_exclusive, void *key) 3163 int nr_exclusive, void *key)
3060{ 3164{
3061 unsigned long flags; 3165 unsigned long flags;
3062 3166
@@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3088 * 3192 *
3089 * On UP it can prevent extra preemption. 3193 * On UP it can prevent extra preemption.
3090 */ 3194 */
3091void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3195void fastcall
3196__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3092{ 3197{
3093 unsigned long flags; 3198 unsigned long flags;
3094 int sync = 1; 3199 int sync = 1;
@@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3279 3384
3280EXPORT_SYMBOL(interruptible_sleep_on); 3385EXPORT_SYMBOL(interruptible_sleep_on);
3281 3386
3282long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3387long fastcall __sched
3388interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3283{ 3389{
3284 SLEEP_ON_VAR 3390 SLEEP_ON_VAR
3285 3391
@@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3498 * @policy: new policy. 3604 * @policy: new policy.
3499 * @param: structure containing the new RT priority. 3605 * @param: structure containing the new RT priority.
3500 */ 3606 */
3501int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) 3607int sched_setscheduler(struct task_struct *p, int policy,
3608 struct sched_param *param)
3502{ 3609{
3503 int retval; 3610 int retval;
3504 int oldprio, oldpolicy = -1; 3611 int oldprio, oldpolicy = -1;
@@ -3518,7 +3625,7 @@ recheck:
3518 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3625 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3519 */ 3626 */
3520 if (param->sched_priority < 0 || 3627 if (param->sched_priority < 0 ||
3521 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3628 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3522 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3629 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3523 return -EINVAL; 3630 return -EINVAL;
3524 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3631 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3581,7 +3688,8 @@ recheck:
3581} 3688}
3582EXPORT_SYMBOL_GPL(sched_setscheduler); 3689EXPORT_SYMBOL_GPL(sched_setscheduler);
3583 3690
3584static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3691static int
3692do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3585{ 3693{
3586 int retval; 3694 int retval;
3587 struct sched_param lparam; 3695 struct sched_param lparam;
@@ -3848,7 +3956,7 @@ asmlinkage long sys_sched_yield(void)
3848 if (rt_task(current)) 3956 if (rt_task(current))
3849 target = rq->active; 3957 target = rq->active;
3850 3958
3851 if (current->array->nr_active == 1) { 3959 if (array->nr_active == 1) {
3852 schedstat_inc(rq, yld_act_empty); 3960 schedstat_inc(rq, yld_act_empty);
3853 if (!rq->expired->nr_active) 3961 if (!rq->expired->nr_active)
3854 schedstat_inc(rq, yld_both_empty); 3962 schedstat_inc(rq, yld_both_empty);
@@ -3912,7 +4020,7 @@ EXPORT_SYMBOL(cond_resched);
3912 * operations here to prevent schedule() from being called twice (once via 4020 * operations here to prevent schedule() from being called twice (once via
3913 * spin_unlock(), once by hand). 4021 * spin_unlock(), once by hand).
3914 */ 4022 */
3915int cond_resched_lock(spinlock_t * lock) 4023int cond_resched_lock(spinlock_t *lock)
3916{ 4024{
3917 int ret = 0; 4025 int ret = 0;
3918 4026
@@ -4095,7 +4203,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
4095 return list_entry(p->sibling.next,struct task_struct,sibling); 4203 return list_entry(p->sibling.next,struct task_struct,sibling);
4096} 4204}
4097 4205
4098static void show_task(task_t * p) 4206static void show_task(task_t *p)
4099{ 4207{
4100 task_t *relative; 4208 task_t *relative;
4101 unsigned state; 4209 unsigned state;
@@ -4121,7 +4229,7 @@ static void show_task(task_t * p)
4121#endif 4229#endif
4122#ifdef CONFIG_DEBUG_STACK_USAGE 4230#ifdef CONFIG_DEBUG_STACK_USAGE
4123 { 4231 {
4124 unsigned long * n = (unsigned long *) (p->thread_info+1); 4232 unsigned long *n = (unsigned long *) (p->thread_info+1);
4125 while (!*n) 4233 while (!*n)
4126 n++; 4234 n++;
4127 free = (unsigned long) n - (unsigned long)(p->thread_info+1); 4235 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4330,7 +4438,7 @@ out:
4330 * thread migration by bumping thread off CPU then 'pushing' onto 4438 * thread migration by bumping thread off CPU then 'pushing' onto
4331 * another runqueue. 4439 * another runqueue.
4332 */ 4440 */
4333static int migration_thread(void * data) 4441static int migration_thread(void *data)
4334{ 4442{
4335 runqueue_t *rq; 4443 runqueue_t *rq;
4336 int cpu = (long)data; 4444 int cpu = (long)data;
@@ -4779,7 +4887,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4779 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4887 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4780 * hold the hotplug lock. 4888 * hold the hotplug lock.
4781 */ 4889 */
4782void cpu_attach_domain(struct sched_domain *sd, int cpu) 4890static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4783{ 4891{
4784 runqueue_t *rq = cpu_rq(cpu); 4892 runqueue_t *rq = cpu_rq(cpu);
4785 struct sched_domain *tmp; 4893 struct sched_domain *tmp;
@@ -4802,7 +4910,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
4802} 4910}
4803 4911
4804/* cpus with isolated domains */ 4912/* cpus with isolated domains */
4805cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4913static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4806 4914
4807/* Setup the mask of cpus configured for isolated domains */ 4915/* Setup the mask of cpus configured for isolated domains */
4808static int __init isolated_cpu_setup(char *str) 4916static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4938,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
4830 * covered by the given span, and will set each group's ->cpumask correctly, 4938 * covered by the given span, and will set each group's ->cpumask correctly,
4831 * and ->cpu_power to 0. 4939 * and ->cpu_power to 0.
4832 */ 4940 */
4833void init_sched_build_groups(struct sched_group groups[], 4941static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4834 cpumask_t span, int (*group_fn)(int cpu)) 4942 int (*group_fn)(int cpu))
4835{ 4943{
4836 struct sched_group *first = NULL, *last = NULL; 4944 struct sched_group *first = NULL, *last = NULL;
4837 cpumask_t covered = CPU_MASK_NONE; 4945 cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4972,85 @@ void init_sched_build_groups(struct sched_group groups[],
4864 last->next = first; 4972 last->next = first;
4865} 4973}
4866 4974
4975#define SD_NODES_PER_DOMAIN 16
4867 4976
4868#ifdef ARCH_HAS_SCHED_DOMAIN 4977#ifdef CONFIG_NUMA
4869extern void build_sched_domains(const cpumask_t *cpu_map); 4978/**
4870extern void arch_init_sched_domains(const cpumask_t *cpu_map); 4979 * find_next_best_node - find the next node to include in a sched_domain
4871extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); 4980 * @node: node whose sched_domain we're building
4872#else 4981 * @used_nodes: nodes already in the sched_domain
4982 *
4983 * Find the next node to include in a given scheduling domain. Simply
4984 * finds the closest node not already in the @used_nodes map.
4985 *
4986 * Should use nodemask_t.
4987 */
4988static int find_next_best_node(int node, unsigned long *used_nodes)
4989{
4990 int i, n, val, min_val, best_node = 0;
4991
4992 min_val = INT_MAX;
4993
4994 for (i = 0; i < MAX_NUMNODES; i++) {
4995 /* Start at @node */
4996 n = (node + i) % MAX_NUMNODES;
4997
4998 if (!nr_cpus_node(n))
4999 continue;
5000
5001 /* Skip already used nodes */
5002 if (test_bit(n, used_nodes))
5003 continue;
5004
5005 /* Simple min distance search */
5006 val = node_distance(node, n);
5007
5008 if (val < min_val) {
5009 min_val = val;
5010 best_node = n;
5011 }
5012 }
5013
5014 set_bit(best_node, used_nodes);
5015 return best_node;
5016}
5017
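/*
 * A user-space sketch of the greedy nearest-node search performed by
 * find_next_best_node() above, over a made-up 4-node distance matrix;
 * the distances are purely illustrative.
 */
#include <stdio.h>
#include <limits.h>

#define NODES 4

static const int node_dist[NODES][NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

static int next_best_node(int node, int used[NODES])
{
	int i, best = -1, min_val = INT_MAX;

	for (i = 0; i < NODES; i++) {
		int n = (node + i) % NODES;	/* start the scan at @node */

		if (used[n])
			continue;
		if (node_dist[node][n] < min_val) {
			min_val = node_dist[node][n];
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[NODES] = { 1, 0, 0, 0 };	/* building node 0's span */
	int i;

	printf("span order for node 0: 0");
	for (i = 1; i < NODES; i++)
		printf(" %d", next_best_node(0, used));
	printf("\n");	/* 0 1 2 3: closest nodes are added first */
	return 0;
}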
5018/**
5019 * sched_domain_node_span - get a cpumask for a node's sched_domain
5020 * @node: node whose cpumask we're constructing
5021 * @size: number of nodes to include in this span
5022 *
5023 * Given a node, construct a good cpumask for its sched_domain to span. It
5024 * should be one that prevents unnecessary balancing, but also spreads tasks
5025 * out optimally.
5026 */
5027static cpumask_t sched_domain_node_span(int node)
5028{
5029 int i;
5030 cpumask_t span, nodemask;
5031 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5032
5033 cpus_clear(span);
5034 bitmap_zero(used_nodes, MAX_NUMNODES);
5035
5036 nodemask = node_to_cpumask(node);
5037 cpus_or(span, span, nodemask);
5038 set_bit(node, used_nodes);
5039
5040 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5041 int next_node = find_next_best_node(node, used_nodes);
5042 nodemask = node_to_cpumask(next_node);
5043 cpus_or(span, span, nodemask);
5044 }
5045
5046 return span;
5047}
5048#endif
5049
5050/*
5051 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5052 * can switch it on easily if needed.
5053 */
4873#ifdef CONFIG_SCHED_SMT 5054#ifdef CONFIG_SCHED_SMT
4874static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 5055static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4875static struct sched_group sched_group_cpus[NR_CPUS]; 5056static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +5072,20 @@ static int cpu_to_phys_group(int cpu)
4891} 5072}
4892 5073
4893#ifdef CONFIG_NUMA 5074#ifdef CONFIG_NUMA
4894
4895static DEFINE_PER_CPU(struct sched_domain, node_domains);
4896static struct sched_group sched_group_nodes[MAX_NUMNODES];
4897static int cpu_to_node_group(int cpu)
4898{
4899 return cpu_to_node(cpu);
4900}
4901#endif
4902
4903#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4904/* 5075/*
4905 * The domains setup code relies on siblings not spanning 5076 * The init_sched_build_groups can't handle what we want to do with node
4906 * multiple nodes. Make sure the architecture has a proper 5077 * groups, so roll our own. Now each node has its own list of groups which
4907 * siblings map: 5078 * gets dynamically allocated.
4908 */ 5079 */
4909static void check_sibling_maps(void) 5080static DEFINE_PER_CPU(struct sched_domain, node_domains);
4910{ 5081static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
4911 int i, j;
4912 5082
4913 for_each_online_cpu(i) { 5083static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
4914 for_each_cpu_mask(j, cpu_sibling_map[i]) { 5084static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
4915 if (cpu_to_node(i) != cpu_to_node(j)) { 5085
4916 printk(KERN_INFO "warning: CPU %d siblings map " 5086static int cpu_to_allnodes_group(int cpu)
4917 "to different node - isolating " 5087{
4918 "them.\n", i); 5088 return cpu_to_node(cpu);
4919 cpu_sibling_map[i] = cpumask_of_cpu(i);
4920 break;
4921 }
4922 }
4923 }
4924} 5089}
4925#endif 5090#endif
4926 5091
@@ -4928,9 +5093,24 @@ static void check_sibling_maps(void)
4928 * Build sched domains for a given set of cpus and attach the sched domains 5093 * Build sched domains for a given set of cpus and attach the sched domains
4929 * to the individual cpus 5094 * to the individual cpus
4930 */ 5095 */
4931static void build_sched_domains(const cpumask_t *cpu_map) 5096void build_sched_domains(const cpumask_t *cpu_map)
4932{ 5097{
4933 int i; 5098 int i;
5099#ifdef CONFIG_NUMA
5100 struct sched_group **sched_group_nodes = NULL;
5101 struct sched_group *sched_group_allnodes = NULL;
5102
5103 /*
5104 * Allocate the per-node list of sched groups
5105 */
5106 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5107 GFP_ATOMIC);
5108 if (!sched_group_nodes) {
5109 printk(KERN_WARNING "Can not alloc sched group node list\n");
5110 return;
5111 }
5112 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5113#endif
4934 5114
4935 /* 5115 /*
4936 * Set up domains for cpus specified by the cpu_map. 5116 * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5123,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4943 cpus_and(nodemask, nodemask, *cpu_map); 5123 cpus_and(nodemask, nodemask, *cpu_map);
4944 5124
4945#ifdef CONFIG_NUMA 5125#ifdef CONFIG_NUMA
5126 if (cpus_weight(*cpu_map)
5127 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5128 if (!sched_group_allnodes) {
5129 sched_group_allnodes
5130 = kmalloc(sizeof(struct sched_group)
5131 * MAX_NUMNODES,
5132 GFP_KERNEL);
5133 if (!sched_group_allnodes) {
5134 printk(KERN_WARNING
5135 "Can not alloc allnodes sched group\n");
5136 break;
5137 }
5138 sched_group_allnodes_bycpu[i]
5139 = sched_group_allnodes;
5140 }
5141 sd = &per_cpu(allnodes_domains, i);
5142 *sd = SD_ALLNODES_INIT;
5143 sd->span = *cpu_map;
5144 group = cpu_to_allnodes_group(i);
5145 sd->groups = &sched_group_allnodes[group];
5146 p = sd;
5147 } else
5148 p = NULL;
5149
4946 sd = &per_cpu(node_domains, i); 5150 sd = &per_cpu(node_domains, i);
4947 group = cpu_to_node_group(i);
4948 *sd = SD_NODE_INIT; 5151 *sd = SD_NODE_INIT;
4949 sd->span = *cpu_map; 5152 sd->span = sched_domain_node_span(cpu_to_node(i));
4950 sd->groups = &sched_group_nodes[group]; 5153 sd->parent = p;
5154 cpus_and(sd->span, sd->span, *cpu_map);
4951#endif 5155#endif
4952 5156
4953 p = sd; 5157 p = sd;
@@ -4972,7 +5176,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4972 5176
4973#ifdef CONFIG_SCHED_SMT 5177#ifdef CONFIG_SCHED_SMT
4974 /* Set up CPU (sibling) groups */ 5178 /* Set up CPU (sibling) groups */
4975 for_each_online_cpu(i) { 5179 for_each_cpu_mask(i, *cpu_map) {
4976 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5180 cpumask_t this_sibling_map = cpu_sibling_map[i];
4977 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5181 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4978 if (i != first_cpu(this_sibling_map)) 5182 if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5201,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4997 5201
4998#ifdef CONFIG_NUMA 5202#ifdef CONFIG_NUMA
4999 /* Set up node groups */ 5203 /* Set up node groups */
5000 init_sched_build_groups(sched_group_nodes, *cpu_map, 5204 if (sched_group_allnodes)
5001 &cpu_to_node_group); 5205 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5206 &cpu_to_allnodes_group);
5207
5208 for (i = 0; i < MAX_NUMNODES; i++) {
5209 /* Set up node groups */
5210 struct sched_group *sg, *prev;
5211 cpumask_t nodemask = node_to_cpumask(i);
5212 cpumask_t domainspan;
5213 cpumask_t covered = CPU_MASK_NONE;
5214 int j;
5215
5216 cpus_and(nodemask, nodemask, *cpu_map);
5217 if (cpus_empty(nodemask)) {
5218 sched_group_nodes[i] = NULL;
5219 continue;
5220 }
5221
5222 domainspan = sched_domain_node_span(i);
5223 cpus_and(domainspan, domainspan, *cpu_map);
5224
5225 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5226 sched_group_nodes[i] = sg;
5227 for_each_cpu_mask(j, nodemask) {
5228 struct sched_domain *sd;
5229 sd = &per_cpu(node_domains, j);
5230 sd->groups = sg;
5231 if (sd->groups == NULL) {
5232 /* Turn off balancing if we have no groups */
5233 sd->flags = 0;
5234 }
5235 }
5236 if (!sg) {
5237 printk(KERN_WARNING
5238 "Can not alloc domain group for node %d\n", i);
5239 continue;
5240 }
5241 sg->cpu_power = 0;
5242 sg->cpumask = nodemask;
5243 cpus_or(covered, covered, nodemask);
5244 prev = sg;
5245
5246 for (j = 0; j < MAX_NUMNODES; j++) {
5247 cpumask_t tmp, notcovered;
5248 int n = (i + j) % MAX_NUMNODES;
5249
5250 cpus_complement(notcovered, covered);
5251 cpus_and(tmp, notcovered, *cpu_map);
5252 cpus_and(tmp, tmp, domainspan);
5253 if (cpus_empty(tmp))
5254 break;
5255
5256 nodemask = node_to_cpumask(n);
5257 cpus_and(tmp, tmp, nodemask);
5258 if (cpus_empty(tmp))
5259 continue;
5260
5261 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5262 if (!sg) {
5263 printk(KERN_WARNING
5264 "Can not alloc domain group for node %d\n", j);
5265 break;
5266 }
5267 sg->cpu_power = 0;
5268 sg->cpumask = tmp;
5269 cpus_or(covered, covered, tmp);
5270 prev->next = sg;
5271 prev = sg;
5272 }
5273 prev->next = sched_group_nodes[i];
5274 }
5002#endif 5275#endif
5003 5276
5004 /* Calculate CPU power for physical packages and nodes */ 5277 /* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5290,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5017 sd->groups->cpu_power = power; 5290 sd->groups->cpu_power = power;
5018 5291
5019#ifdef CONFIG_NUMA 5292#ifdef CONFIG_NUMA
5020 if (i == first_cpu(sd->groups->cpumask)) { 5293 sd = &per_cpu(allnodes_domains, i);
5021 /* Only add "power" once for each physical package. */ 5294 if (sd->groups) {
5022 sd = &per_cpu(node_domains, i); 5295 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5023 sd->groups->cpu_power += power; 5296 (cpus_weight(sd->groups->cpumask)-1) / 10;
5297 sd->groups->cpu_power = power;
5024 } 5298 }
5025#endif 5299#endif
5026 } 5300 }
5027 5301
5302#ifdef CONFIG_NUMA
5303 for (i = 0; i < MAX_NUMNODES; i++) {
5304 struct sched_group *sg = sched_group_nodes[i];
5305 int j;
5306
5307 if (sg == NULL)
5308 continue;
5309next_sg:
5310 for_each_cpu_mask(j, sg->cpumask) {
5311 struct sched_domain *sd;
5312 int power;
5313
5314 sd = &per_cpu(phys_domains, j);
5315 if (j != first_cpu(sd->groups->cpumask)) {
5316 /*
5317 * Only add "power" once for each
5318 * physical package.
5319 */
5320 continue;
5321 }
5322 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5323 (cpus_weight(sd->groups->cpumask)-1) / 10;
5324
5325 sg->cpu_power += power;
5326 }
5327 sg = sg->next;
5328 if (sg != sched_group_nodes[i])
5329 goto next_sg;
5330 }
5331#endif
5332
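/*
 * Worked example of the cpu_power formula used above:
 * power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (weight - 1) / 10.
 * SCHED_LOAD_SCALE is assumed to be 128 here and the group size is invented.
 */
#include <stdio.h>

#define MODEL_LOAD_SCALE 128	/* stands in for SCHED_LOAD_SCALE */

int main(void)
{
	int weight = 4;	/* cpus_weight(sd->groups->cpumask), hypothetical */
	int power = MODEL_LOAD_SCALE + MODEL_LOAD_SCALE * (weight - 1) / 10;

	printf("group cpu_power = %d\n", power);	/* 128 + 128*3/10 = 166 */
	return 0;
}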
5028 /* Attach the domains */ 5333 /* Attach the domains */
5029 for_each_cpu_mask(i, *cpu_map) { 5334 for_each_cpu_mask(i, *cpu_map) {
5030 struct sched_domain *sd; 5335 struct sched_domain *sd;
@@ -5039,13 +5344,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5039/* 5344/*
5040 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5345 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5041 */ 5346 */
5042static void arch_init_sched_domains(cpumask_t *cpu_map) 5347static void arch_init_sched_domains(const cpumask_t *cpu_map)
5043{ 5348{
5044 cpumask_t cpu_default_map; 5349 cpumask_t cpu_default_map;
5045 5350
5046#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5047 check_sibling_maps();
5048#endif
5049 /* 5351 /*
5050 * Setup mask for cpus without special case scheduling requirements. 5352 * Setup mask for cpus without special case scheduling requirements.
5051 * For now this just excludes isolated cpus, but could be used to 5353 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5360,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
5058 5360
5059static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5361static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5060{ 5362{
5061 /* Do nothing: everything is statically allocated. */ 5363#ifdef CONFIG_NUMA
5062} 5364 int i;
5365 int cpu;
5063 5366
5064#endif /* ARCH_HAS_SCHED_DOMAIN */ 5367 for_each_cpu_mask(cpu, *cpu_map) {
5368 struct sched_group *sched_group_allnodes
5369 = sched_group_allnodes_bycpu[cpu];
5370 struct sched_group **sched_group_nodes
5371 = sched_group_nodes_bycpu[cpu];
5372
5373 if (sched_group_allnodes) {
5374 kfree(sched_group_allnodes);
5375 sched_group_allnodes_bycpu[cpu] = NULL;
5376 }
5377
5378 if (!sched_group_nodes)
5379 continue;
5380
5381 for (i = 0; i < MAX_NUMNODES; i++) {
5382 cpumask_t nodemask = node_to_cpumask(i);
5383 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5384
5385 cpus_and(nodemask, nodemask, *cpu_map);
5386 if (cpus_empty(nodemask))
5387 continue;
5388
5389 if (sg == NULL)
5390 continue;
5391 sg = sg->next;
5392next_sg:
5393 oldsg = sg;
5394 sg = sg->next;
5395 kfree(oldsg);
5396 if (oldsg != sched_group_nodes[i])
5397 goto next_sg;
5398 }
5399 kfree(sched_group_nodes);
5400 sched_group_nodes_bycpu[cpu] = NULL;
5401 }
5402#endif
5403}
5065 5404
5066/* 5405/*
5067 * Detach sched domains from a group of cpus specified in cpu_map 5406 * Detach sched domains from a group of cpus specified in cpu_map
@@ -5263,3 +5602,47 @@ void normalize_rt_tasks(void)
5263} 5602}
5264 5603
5265#endif /* CONFIG_MAGIC_SYSRQ */ 5604#endif /* CONFIG_MAGIC_SYSRQ */
5605
5606#ifdef CONFIG_IA64
5607/*
5608 * These functions are only useful for the IA64 MCA handling.
5609 *
5610 * They can only be called when the whole system has been
5611 * stopped - every CPU needs to be quiescent, and no scheduling
5612 * activity can take place. Using them for anything else would
5613 * be a serious bug, and as a result, they aren't even visible
5614 * under any other configuration.
5615 */
5616
5617/**
5618 * curr_task - return the current task for a given cpu.
5619 * @cpu: the processor in question.
5620 *
5621 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
5622 */
5623task_t *curr_task(int cpu)
5624{
5625 return cpu_curr(cpu);
5626}
5627
5628/**
5629 * set_curr_task - set the current task for a given cpu.
5630 * @cpu: the processor in question.
5631 * @p: the task pointer to set.
5632 *
5633 * Description: This function must only be used when non-maskable interrupts
5634 * are serviced on a separate stack. It allows the architecture to switch the
5635 * notion of the current task on a cpu in a non-blocking manner. This function
5636 * must be called with all CPUs synchronized and interrupts disabled; the
5637 * caller must save the original value of the current task (see
5638 * curr_task() above) and restore that value before reenabling interrupts and
5639 * re-starting the system.
5640 *
5641 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
5642 */
5643void set_curr_task(int cpu, task_t *p)
5644{
5645 cpu_curr(cpu) = p;
5646}
5647
5648#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index d282fea81138..b92c3c9f8b9a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
678 678
679/* forward decl */ 679/* forward decl */
680static void do_notify_parent_cldstop(struct task_struct *tsk, 680static void do_notify_parent_cldstop(struct task_struct *tsk,
681 struct task_struct *parent, 681 int to_self,
682 int why); 682 int why);
683 683
684/* 684/*
@@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
729 p->signal->group_stop_count = 0; 729 p->signal->group_stop_count = 0;
730 p->signal->flags = SIGNAL_STOP_CONTINUED; 730 p->signal->flags = SIGNAL_STOP_CONTINUED;
731 spin_unlock(&p->sighand->siglock); 731 spin_unlock(&p->sighand->siglock);
732 if (p->ptrace & PT_PTRACED) 732 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
733 do_notify_parent_cldstop(p, p->parent,
734 CLD_STOPPED);
735 else
736 do_notify_parent_cldstop(
737 p->group_leader,
738 p->group_leader->real_parent,
739 CLD_STOPPED);
740 spin_lock(&p->sighand->siglock); 733 spin_lock(&p->sighand->siglock);
741 } 734 }
742 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 735 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
777 p->signal->flags = SIGNAL_STOP_CONTINUED; 770 p->signal->flags = SIGNAL_STOP_CONTINUED;
778 p->signal->group_exit_code = 0; 771 p->signal->group_exit_code = 0;
779 spin_unlock(&p->sighand->siglock); 772 spin_unlock(&p->sighand->siglock);
780 if (p->ptrace & PT_PTRACED) 773 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
781 do_notify_parent_cldstop(p, p->parent,
782 CLD_CONTINUED);
783 else
784 do_notify_parent_cldstop(
785 p->group_leader,
786 p->group_leader->real_parent,
787 CLD_CONTINUED);
788 spin_lock(&p->sighand->siglock); 774 spin_lock(&p->sighand->siglock);
789 } else { 775 } else {
790 /* 776 /*
@@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1380 unsigned long flags; 1366 unsigned long flags;
1381 int ret = 0; 1367 int ret = 0;
1382 1368
1383 /*
1384 * We need the tasklist lock even for the specific
1385 * thread case (when we don't need to follow the group
1386 * lists) in order to avoid races with "p->sighand"
1387 * going away or changing from under us.
1388 */
1389 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1369 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1390 read_lock(&tasklist_lock); 1370 read_lock(&tasklist_lock);
1371
1372 if (unlikely(p->flags & PF_EXITING)) {
1373 ret = -1;
1374 goto out_err;
1375 }
1376
1391 spin_lock_irqsave(&p->sighand->siglock, flags); 1377 spin_lock_irqsave(&p->sighand->siglock, flags);
1392 1378
1393 if (unlikely(!list_empty(&q->list))) { 1379 if (unlikely(!list_empty(&q->list))) {
1394 /* 1380 /*
1395 * If an SI_TIMER entry is already queued just increment 1381 * If an SI_TIMER entry is already queued just increment
@@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1399 BUG(); 1385 BUG();
1400 q->info.si_overrun++; 1386 q->info.si_overrun++;
1401 goto out; 1387 goto out;
1402 } 1388 }
1403 /* Short-circuit ignored signals. */ 1389 /* Short-circuit ignored signals. */
1404 if (sig_ignored(p, sig)) { 1390 if (sig_ignored(p, sig)) {
1405 ret = 1; 1391 ret = 1;
@@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1414 1400
1415out: 1401out:
1416 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1402 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1403out_err:
1417 read_unlock(&tasklist_lock); 1404 read_unlock(&tasklist_lock);
1418 return(ret); 1405
1406 return ret;
1419} 1407}
1420 1408
1421int 1409int
@@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1542 spin_unlock_irqrestore(&psig->siglock, flags); 1530 spin_unlock_irqrestore(&psig->siglock, flags);
1543} 1531}
1544 1532
1545static void 1533static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
1546do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
1547 int why)
1548{ 1534{
1549 struct siginfo info; 1535 struct siginfo info;
1550 unsigned long flags; 1536 unsigned long flags;
1537 struct task_struct *parent;
1551 struct sighand_struct *sighand; 1538 struct sighand_struct *sighand;
1552 1539
1540 if (to_self)
1541 parent = tsk->parent;
1542 else {
1543 tsk = tsk->group_leader;
1544 parent = tsk->real_parent;
1545 }
1546
1553 info.si_signo = SIGCHLD; 1547 info.si_signo = SIGCHLD;
1554 info.si_errno = 0; 1548 info.si_errno = 0;
1555 info.si_pid = tsk->pid; 1549 info.si_pid = tsk->pid;
@@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1618 !(current->ptrace & PT_ATTACHED)) && 1612 !(current->ptrace & PT_ATTACHED)) &&
1619 (likely(current->parent->signal != current->signal) || 1613 (likely(current->parent->signal != current->signal) ||
1620 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1614 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1621 do_notify_parent_cldstop(current, current->parent, 1615 do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
1622 CLD_TRAPPED);
1623 read_unlock(&tasklist_lock); 1616 read_unlock(&tasklist_lock);
1624 schedule(); 1617 schedule();
1625 } else { 1618 } else {
@@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code)
1668static void 1661static void
1669finish_stop(int stop_count) 1662finish_stop(int stop_count)
1670{ 1663{
1664 int to_self;
1665
1671 /* 1666 /*
1672 * If there are no other threads in the group, or if there is 1667 * If there are no other threads in the group, or if there is
1673 * a group stop in progress and we are the last to stop, 1668 * a group stop in progress and we are the last to stop,
1674 * report to the parent. When ptraced, every thread reports itself. 1669 * report to the parent. When ptraced, every thread reports itself.
1675 */ 1670 */
1676 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { 1671 if (stop_count < 0 || (current->ptrace & PT_PTRACED))
1677 read_lock(&tasklist_lock); 1672 to_self = 1;
1678 do_notify_parent_cldstop(current, current->parent, 1673 else if (stop_count == 0)
1679 CLD_STOPPED); 1674 to_self = 0;
1680 read_unlock(&tasklist_lock); 1675 else
1681 } 1676 goto out;
1682 else if (stop_count == 0) {
1683 read_lock(&tasklist_lock);
1684 do_notify_parent_cldstop(current->group_leader,
1685 current->group_leader->real_parent,
1686 CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689 1677
1678 read_lock(&tasklist_lock);
1679 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1680 read_unlock(&tasklist_lock);
1681
1682out:
1690 schedule(); 1683 schedule();
1691 /* 1684 /*
1692 * Now we don't run again until continued. 1685 * Now we don't run again until continued.
@@ -2228,8 +2221,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2228 recalc_sigpending(); 2221 recalc_sigpending();
2229 spin_unlock_irq(&current->sighand->siglock); 2222 spin_unlock_irq(&current->sighand->siglock);
2230 2223
2231 current->state = TASK_INTERRUPTIBLE; 2224 timeout = schedule_timeout_interruptible(timeout);
2232 timeout = schedule_timeout(timeout);
2233 2225
2234 try_to_freeze(); 2226 try_to_freeze();
2235 spin_lock_irq(&current->sighand->siglock); 2227 spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b4ab6af1dea8..f766b2fc48be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void)
84 cpu = smp_processor_id(); 84 cpu = smp_processor_id();
85restart: 85restart:
86 /* Reset the pending bitmask before enabling irqs */ 86 /* Reset the pending bitmask before enabling irqs */
87 local_softirq_pending() = 0; 87 set_softirq_pending(0);
88 88
89 local_irq_enable(); 89 local_irq_enable();
90 90
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat
5 *
 6 * this code detects soft lockups: incidents where, on a CPU,
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9
10#include <linux/mm.h>
11#include <linux/cpu.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/kthread.h>
15#include <linux/notifier.h>
16#include <linux/module.h>
17
18static DEFINE_SPINLOCK(print_lock);
19
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23
24static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event,
26 void *ptr)
27{
28 did_panic = 1;
29
30 return NOTIFY_DONE;
31}
32
33static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic,
35};
36
37void touch_softlockup_watchdog(void)
38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
40}
41EXPORT_SYMBOL(touch_softlockup_watchdog);
42
43/*
44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not:
46 */
47void softlockup_tick(struct pt_regs *regs)
48{
49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu);
51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp)
53 return;
54
55 /* Do not cause a second panic when there already was one */
56 if (did_panic)
57 return;
58
59 if (time_after(jiffies, timestamp + 10*HZ)) {
60 per_cpu(print_timestamp, this_cpu) = timestamp;
61
62 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu);
65 show_regs(regs);
66 spin_unlock(&print_lock);
67 }
68}
69
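/*
 * A user-space sketch of the threshold check in softlockup_tick() above:
 * the watchdog thread refreshes a per-cpu timestamp about once a second,
 * and the timer tick complains once jiffies runs more than 10*HZ ahead of
 * that timestamp. HZ and the sample values below are hypothetical.
 */
#include <stdio.h>

#define MODEL_HZ 1000

static int lockup_suspected(unsigned long jiffies, unsigned long timestamp)
{
	/* models time_after(jiffies, timestamp + 10*HZ) */
	return (long)(jiffies - (timestamp + 10 * MODEL_HZ)) > 0;
}

int main(void)
{
	unsigned long last_touch = 50000;

	printf("%d\n", lockup_suspected(last_touch +  9 * MODEL_HZ, last_touch)); /* 0 */
	printf("%d\n", lockup_suspected(last_touch + 11 * MODEL_HZ, last_touch)); /* 1 */
	return 0;
}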
70/*
71 * The watchdog thread - runs every second and touches the timestamp.
72 */
73static int watchdog(void * __bind_cpu)
74{
75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79
80 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE;
82
83 set_current_state(TASK_INTERRUPTIBLE);
84
85 /*
86 * Run briefly once per second - if this gets delayed for
87 * more than 10 seconds then the debug-printout triggers
88 * in softlockup_tick():
89 */
90 while (!kthread_should_stop()) {
91 msleep_interruptible(1000);
92 touch_softlockup_watchdog();
93 }
94 __set_current_state(TASK_RUNNING);
95
96 return 0;
97}
98
99/*
100 * Create/destroy watchdog threads as CPUs come and go:
101 */
102static int __devinit
103cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
104{
105 int hotcpu = (unsigned long)hcpu;
106 struct task_struct *p;
107
108 switch (action) {
109 case CPU_UP_PREPARE:
110 BUG_ON(per_cpu(watchdog_task, hotcpu));
111 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
112 if (IS_ERR(p)) {
113 printk("watchdog for %i failed\n", hotcpu);
114 return NOTIFY_BAD;
115 }
116 per_cpu(watchdog_task, hotcpu) = p;
117 kthread_bind(p, hotcpu);
118 break;
119 case CPU_ONLINE:
120
121 wake_up_process(per_cpu(watchdog_task, hotcpu));
122 break;
123#ifdef CONFIG_HOTPLUG_CPU
124 case CPU_UP_CANCELED:
125 /* Unbind so it can run. Fall thru. */
126 kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
127 case CPU_DEAD:
128 p = per_cpu(watchdog_task, hotcpu);
129 per_cpu(watchdog_task, hotcpu) = NULL;
130 kthread_stop(p);
131 break;
132#endif /* CONFIG_HOTPLUG_CPU */
133 }
134 return NOTIFY_OK;
135}
136
137static struct notifier_block __devinitdata cpu_nfb = {
138 .notifier_call = cpu_callback
139};
140
141__init void spawn_softlockup_task(void)
142{
143 void *cpu = (void *)(long)smp_processor_id();
144
145 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
146 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
147 register_cpu_notifier(&cpu_nfb);
148
149 notifier_chain_register(&panic_notifier_list, &panic_block);
150}
151
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0c3f9d8bbe17..0375fcd5921d 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -3,7 +3,10 @@
3 * 3 *
4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com> 4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
5 * 5 *
6 * Copyright (2004) Ingo Molnar 6 * Copyright (2004, 2005) Ingo Molnar
7 *
8 * This file contains the spinlock/rwlock implementations for the
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
7 */ 10 */
8 11
9#include <linux/config.h> 12#include <linux/config.h>
@@ -17,12 +20,12 @@
17 * Generic declaration of the raw read_trylock() function, 20 * Generic declaration of the raw read_trylock() function,
18 * architectures are supposed to optimize this: 21 * architectures are supposed to optimize this:
19 */ 22 */
20int __lockfunc generic_raw_read_trylock(rwlock_t *lock) 23int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
21{ 24{
22 _raw_read_lock(lock); 25 __raw_read_lock(lock);
23 return 1; 26 return 1;
24} 27}
25EXPORT_SYMBOL(generic_raw_read_trylock); 28EXPORT_SYMBOL(generic__raw_read_trylock);
26 29
27int __lockfunc _spin_trylock(spinlock_t *lock) 30int __lockfunc _spin_trylock(spinlock_t *lock)
28{ 31{
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock)
57} 60}
58EXPORT_SYMBOL(_write_trylock); 61EXPORT_SYMBOL(_write_trylock);
59 62
60#ifndef CONFIG_PREEMPT 63#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
61 64
62void __lockfunc _read_lock(rwlock_t *lock) 65void __lockfunc _read_lock(rwlock_t *lock)
63{ 66{
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
72 75
73 local_irq_save(flags); 76 local_irq_save(flags);
74 preempt_disable(); 77 preempt_disable();
75 _raw_spin_lock_flags(lock, flags); 78 _raw_spin_lock_flags(lock, &flags);
76 return flags; 79 return flags;
77} 80}
78EXPORT_SYMBOL(_spin_lock_irqsave); 81EXPORT_SYMBOL(_spin_lock_irqsave);
diff --git a/kernel/sys.c b/kernel/sys.c
index 0bcaed6560ac..c80412be2302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1711,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1711 unsigned long arg4, unsigned long arg5) 1711 unsigned long arg4, unsigned long arg5)
1712{ 1712{
1713 long error; 1713 long error;
1714 int sig;
1715 1714
1716 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1715 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1717 if (error) 1716 if (error)
@@ -1719,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1719 1718
1720 switch (option) { 1719 switch (option) {
1721 case PR_SET_PDEATHSIG: 1720 case PR_SET_PDEATHSIG:
1722 sig = arg2; 1721 if (!valid_signal(arg2)) {
1723 if (!valid_signal(sig)) {
1724 error = -EINVAL; 1722 error = -EINVAL;
1725 break; 1723 break;
1726 } 1724 }
1727 current->pdeath_signal = sig; 1725 current->pdeath_signal = arg2;
1728 break; 1726 break;
1729 case PR_GET_PDEATHSIG: 1727 case PR_GET_PDEATHSIG:
1730 error = put_user(current->pdeath_signal, (int __user *)arg2); 1728 error = put_user(current->pdeath_signal, (int __user *)arg2);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3e0bbee549ea..8e56e2495542 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -31,6 +31,7 @@
31#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/net.h>
34#include <linux/sysrq.h> 35#include <linux/sysrq.h>
35#include <linux/highuid.h> 36#include <linux/highuid.h>
36#include <linux/writeback.h> 37#include <linux/writeback.h>
@@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header =
136 137
137static ctl_table kern_table[]; 138static ctl_table kern_table[];
138static ctl_table vm_table[]; 139static ctl_table vm_table[];
139#ifdef CONFIG_NET
140extern ctl_table net_table[];
141#endif
142static ctl_table proc_table[]; 140static ctl_table proc_table[];
143static ctl_table fs_table[]; 141static ctl_table fs_table[];
144static ctl_table debug_table[]; 142static ctl_table debug_table[];
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..3ba10fa35b60 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
950{ 950{
951 jiffies_64++; 951 jiffies_64++;
952 update_times(); 952 update_times();
953 softlockup_tick(regs);
953} 954}
954 955
955#ifdef __ARCH_WANT_SYS_ALARM 956#ifdef __ARCH_WANT_SYS_ALARM
@@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1150 out: 1151 out:
1151 return timeout < 0 ? 0 : timeout; 1152 return timeout < 0 ? 0 : timeout;
1152} 1153}
1153
1154EXPORT_SYMBOL(schedule_timeout); 1154EXPORT_SYMBOL(schedule_timeout);
1155 1155
1156/*
1157 * We can use __set_current_state() here because schedule_timeout() calls
1158 * schedule() unconditionally.
1159 */
1160signed long __sched schedule_timeout_interruptible(signed long timeout)
1161{
1162 __set_current_state(TASK_INTERRUPTIBLE);
1163 return schedule_timeout(timeout);
1164}
1165EXPORT_SYMBOL(schedule_timeout_interruptible);
1166
1167signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1168{
1169 __set_current_state(TASK_UNINTERRUPTIBLE);
1170 return schedule_timeout(timeout);
1171}
1172EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1173
1156/* Thread ID - the internal kernel "pid" */ 1174/* Thread ID - the internal kernel "pid" */
1157asmlinkage long sys_gettid(void) 1175asmlinkage long sys_gettid(void)
1158{ 1176{
@@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
1169 if (!time_after(expire, now)) 1187 if (!time_after(expire, now))
1170 return 0; 1188 return 0;
1171 1189
1172 current->state = TASK_INTERRUPTIBLE; 1190 expire = schedule_timeout_interruptible(expire - now);
1173 expire = schedule_timeout(expire - now);
1174 1191
1175 ret = 0; 1192 ret = 0;
1176 if (expire) { 1193 if (expire) {
@@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
1198 return -EINVAL; 1215 return -EINVAL;
1199 1216
1200 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 1217 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1201 current->state = TASK_INTERRUPTIBLE; 1218 expire = schedule_timeout_interruptible(expire);
1202 expire = schedule_timeout(expire);
1203 1219
1204 ret = 0; 1220 ret = 0;
1205 if (expire) { 1221 if (expire) {
@@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
1428 } 1444 }
1429} 1445}
1430 1446
1431static inline u64 time_interpolator_get_counter(void) 1447static inline u64 time_interpolator_get_counter(int writelock)
1432{ 1448{
1433 unsigned int src = time_interpolator->source; 1449 unsigned int src = time_interpolator->source;
1434 1450
@@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void)
1442 now = time_interpolator_get_cycles(src); 1458 now = time_interpolator_get_cycles(src);
1443 if (lcycle && time_after(lcycle, now)) 1459 if (lcycle && time_after(lcycle, now))
1444 return lcycle; 1460 return lcycle;
1461
1462 /* When holding the xtime write lock, there's no need
1463 * to add the overhead of the cmpxchg. Readers are
1464 * forced to retry until the write lock is released.
1465 */
1466 if (writelock) {
1467 time_interpolator->last_cycle = now;
1468 return now;
1469 }
1445 /* Keep track of the last timer value returned. The use of cmpxchg here 1470 /* Keep track of the last timer value returned. The use of cmpxchg here
1446 * will cause contention in an SMP environment. 1471 * will cause contention in an SMP environment.
1447 */ 1472 */
@@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void)
1455void time_interpolator_reset(void) 1480void time_interpolator_reset(void)
1456{ 1481{
1457 time_interpolator->offset = 0; 1482 time_interpolator->offset = 0;
1458 time_interpolator->last_counter = time_interpolator_get_counter(); 1483 time_interpolator->last_counter = time_interpolator_get_counter(1);
1459} 1484}
1460 1485
1461#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) 1486#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
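/*
 * Worked example of the GET_TI_NSECS() fixed-point conversion above:
 * nsec = ((count - last_counter) & mask) * nsec_per_cyc >> shift.
 * The numbers are hypothetical: a 500 MHz counter is 2 ns per cycle, which
 * with shift = 10 would be encoded as nsec_per_cyc = 2 << 10 = 2048.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long last_counter = 1000;
	unsigned long long count = last_counter + 500000;	/* 500000 cycles later */
	unsigned long long mask = ~0ULL;
	unsigned long long nsec_per_cyc = 2048;			/* 2 ns/cycle << 10 */
	unsigned int shift = 10;
	unsigned long long nsec;

	nsec = (((count - last_counter) & mask) * nsec_per_cyc) >> shift;
	printf("%llu ns\n", nsec);	/* 500000 * 2 = 1000000 ns, i.e. 1 ms */
	return 0;
}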
@@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void)
1467 return 0; 1492 return 0;
1468 1493
1469 return time_interpolator->offset + 1494 return time_interpolator->offset +
1470 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); 1495 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1471} 1496}
1472 1497
1473#define INTERPOLATOR_ADJUST 65536 1498#define INTERPOLATOR_ADJUST 65536
@@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec)
1490 * and the tuning logic ensures that. 1515 * and the tuning logic ensures that.
1491 */ 1516 */
1492 1517
1493 counter = time_interpolator_get_counter(); 1518 counter = time_interpolator_get_counter(1);
1494 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1519 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1495 1520
1496 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1521 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
@@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs)
1588{ 1613{
1589 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1614 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1590 1615
1591 while (timeout) { 1616 while (timeout)
1592 set_current_state(TASK_UNINTERRUPTIBLE); 1617 timeout = schedule_timeout_uninterruptible(timeout);
1593 timeout = schedule_timeout(timeout);
1594 }
1595} 1618}
1596 1619
1597EXPORT_SYMBOL(msleep); 1620EXPORT_SYMBOL(msleep);
@@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs)
1604{ 1627{
1605 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1628 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1606 1629
1607 while (timeout && !signal_pending(current)) { 1630 while (timeout && !signal_pending(current))
1608 set_current_state(TASK_INTERRUPTIBLE); 1631 timeout = schedule_timeout_interruptible(timeout);
1609 timeout = schedule_timeout(timeout);
1610 }
1611 return jiffies_to_msecs(timeout); 1632 return jiffies_to_msecs(timeout);
1612} 1633}
1613 1634
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c7e36d4a70ca..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
308 struct workqueue_struct *wq; 308 struct workqueue_struct *wq;
309 struct task_struct *p; 309 struct task_struct *p;
310 310
311 wq = kmalloc(sizeof(*wq), GFP_KERNEL); 311 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
312 if (!wq) 312 if (!wq)
313 return NULL; 313 return NULL;
314 memset(wq, 0, sizeof(*wq));
315 314
316 wq->name = name; 315 wq->name = name;
317 /* We don't need the distraction of CPUs appearing and vanishing. */ 316 /* We don't need the distraction of CPUs appearing and vanishing. */
@@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
499 case CPU_UP_PREPARE: 498 case CPU_UP_PREPARE:
500 /* Create a new workqueue thread for it. */ 499 /* Create a new workqueue thread for it. */
501 list_for_each_entry(wq, &workqueues, list) { 500 list_for_each_entry(wq, &workqueues, list) {
502 if (create_workqueue_thread(wq, hotcpu) < 0) { 501 if (!create_workqueue_thread(wq, hotcpu)) {
503 printk("workqueue for %i failed\n", hotcpu); 502 printk("workqueue for %i failed\n", hotcpu);
504 return NOTIFY_BAD; 503 return NOTIFY_BAD;
505 } 504 }