Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile         |   1
-rw-r--r--  kernel/acct.c           |   2
-rw-r--r--  kernel/cpuset.c         | 125
-rw-r--r--  kernel/futex.c          | 137
-rw-r--r--  kernel/intermodule.c    |   3
-rw-r--r--  kernel/irq/handle.c     |   2
-rw-r--r--  kernel/irq/manage.c     |   4
-rw-r--r--  kernel/irq/proc.c       |  14
-rw-r--r--  kernel/kprobes.c        |  94
-rw-r--r--  kernel/module.c         |  33
-rw-r--r--  kernel/params.c         |   4
-rw-r--r--  kernel/posix-timers.c   |  28
-rw-r--r--  kernel/power/Kconfig    |   2
-rw-r--r--  kernel/power/pm.c       |   3
-rw-r--r--  kernel/power/swsusp.c   |   1
-rw-r--r--  kernel/printk.c         |  13
-rw-r--r--  kernel/ptrace.c         |  41
-rw-r--r--  kernel/resource.c       |   3
-rw-r--r--  kernel/sched.c          | 339
-rw-r--r--  kernel/signal.c         |  83
-rw-r--r--  kernel/softlockup.c     | 151
-rw-r--r--  kernel/sys.c            |   6
-rw-r--r--  kernel/timer.c          |  18
-rw-r--r--  kernel/workqueue.c      |   5
24 files changed, 887 insertions(+), 225 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..8d57a2f1226b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
28obj-$(CONFIG_KPROBES) += kprobes.o 28obj-$(CONFIG_KPROBES) += kprobes.o
29obj-$(CONFIG_SYSFS) += ksysfs.o 29obj-$(CONFIG_SYSFS) += ksysfs.o
30obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 31obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 32obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
32obj-$(CONFIG_SECCOMP) += seccomp.o 33obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..f70e6027cca9 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -220,7 +220,7 @@ asmlinkage long sys_acct(const char __user *name)
220 return (PTR_ERR(tmp)); 220 return (PTR_ERR(tmp));
221 } 221 }
222 /* Difference from BSD - they don't do O_APPEND */ 222 /* Difference from BSD - they don't do O_APPEND */
223 file = filp_open(tmp, O_WRONLY|O_APPEND, 0); 223 file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
224 putname(tmp); 224 putname(tmp);
225 if (IS_ERR(file)) { 225 if (IS_ERR(file)) {
226 return (PTR_ERR(file)); 226 return (PTR_ERR(file));
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..1f06e7690106 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -628,13 +628,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
629 */ 629 */
630 630
631/*
632 * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
633 * Disable letting 'cpu_exclusive' cpusets define dynamic sched
634 * domains, until the sched domain can handle partial nodes.
635 * Remove this #if hackery when sched domains fixed.
636 */
637#if 0
638static void update_cpu_domains(struct cpuset *cur) 631static void update_cpu_domains(struct cpuset *cur)
639{ 632{
640 struct cpuset *c, *par = cur->parent; 633 struct cpuset *c, *par = cur->parent;
@@ -675,11 +668,6 @@ static void update_cpu_domains(struct cpuset *cur)
675 partition_sched_domains(&pspan, &cspan); 668 partition_sched_domains(&pspan, &cspan);
676 unlock_cpu_hotplug(); 669 unlock_cpu_hotplug();
677} 670}
678#else
679static void update_cpu_domains(struct cpuset *cur)
680{
681}
682#endif
683 671
684static int update_cpumask(struct cpuset *cs, char *buf) 672static int update_cpumask(struct cpuset *cs, char *buf)
685{ 673{
@@ -1611,17 +1599,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1611 return 0; 1599 return 0;
1612} 1600}
1613 1601
1602/*
1603 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1604 * ancestor to the specified cpuset. Call while holding cpuset_sem.
1605 * If no ancestor is mem_exclusive (an unusual configuration), then
1606 * returns the root cpuset.
1607 */
1608static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1609{
1610 while (!is_mem_exclusive(cs) && cs->parent)
1611 cs = cs->parent;
1612 return cs;
1613}
1614
1614/** 1615/**
1615 * cpuset_zone_allowed - is zone z allowed in current->mems_allowed 1616 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
1616 * @z: zone in question 1617 * @z: is this zone on an allowed node?
1618 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
1617 * 1619 *
1618 * Is zone z allowed in current->mems_allowed, or is 1620 * If we're in interrupt, yes, we can always allocate. If zone
1619 * the CPU in interrupt context? (zone is always allowed in this case) 1621 * z's node is in our tasks mems_allowed, yes. If it's not a
1620 */ 1622 * __GFP_HARDWALL request and this zone's nodes is in the nearest
1621int cpuset_zone_allowed(struct zone *z) 1623 * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
1624 * Otherwise, no.
1625 *
1626 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
1627 * and do not allow allocations outside the current tasks cpuset.
1628 * GFP_KERNEL allocations are not so marked, so can escape to the
1629 * nearest mem_exclusive ancestor cpuset.
1630 *
1631 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
1632 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1633 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
1634 * mems_allowed came up empty on the first pass over the zonelist.
1635 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1636 * short of memory, might require taking the cpuset_sem semaphore.
1637 *
1638 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1639 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
1640 * hardwall cpusets - no allocation on a node outside the cpuset is
1641 * allowed (unless in interrupt, of course).
1642 *
1643 * The second loop doesn't even call here for GFP_ATOMIC requests
1644 * (if the __alloc_pages() local variable 'wait' is set). That check
1645 * and the checks below have the combined affect in the second loop of
1646 * the __alloc_pages() routine that:
1647 * in_interrupt - any node ok (current task context irrelevant)
1648 * GFP_ATOMIC - any node ok
1649 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
1650 * GFP_USER - only nodes in current tasks mems allowed ok.
1651 **/
1652
1653int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
1622{ 1654{
1623 return in_interrupt() || 1655 int node; /* node that zone z is on */
1624 node_isset(z->zone_pgdat->node_id, current->mems_allowed); 1656 const struct cpuset *cs; /* current cpuset ancestors */
1657 int allowed = 1; /* is allocation in zone z allowed? */
1658
1659 if (in_interrupt())
1660 return 1;
1661 node = z->zone_pgdat->node_id;
1662 if (node_isset(node, current->mems_allowed))
1663 return 1;
1664 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
1665 return 0;
1666
1667 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1668 down(&cpuset_sem);
1669 cs = current->cpuset;
1670 if (!cs)
1671 goto done; /* current task exiting */
1672 cs = nearest_exclusive_ancestor(cs);
1673 allowed = node_isset(node, cs->mems_allowed);
1674done:
1675 up(&cpuset_sem);
1676 return allowed;
1677}
1678
1679/**
1680 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1681 * @p: pointer to task_struct of some other task.
1682 *
1683 * Description: Return true if the nearest mem_exclusive ancestor
1684 * cpusets of tasks @p and current overlap. Used by oom killer to
1685 * determine if task @p's memory usage might impact the memory
1686 * available to the current task.
1687 *
1688 * Acquires cpuset_sem - not suitable for calling from a fast path.
1689 **/
1690
1691int cpuset_excl_nodes_overlap(const struct task_struct *p)
1692{
1693 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1694 int overlap = 0; /* do cpusets overlap? */
1695
1696 down(&cpuset_sem);
1697 cs1 = current->cpuset;
1698 if (!cs1)
1699 goto done; /* current task exiting */
1700 cs2 = p->cpuset;
1701 if (!cs2)
1702 goto done; /* task p is exiting */
1703 cs1 = nearest_exclusive_ancestor(cs1);
1704 cs2 = nearest_exclusive_ancestor(cs2);
1705 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1706done:
1707 up(&cpuset_sem);
1708
1709 return overlap;
1625} 1710}
1626 1711
1627/* 1712/*
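The reworked cpuset_zone_allowed() replaces the old single node_isset() test with the decision ladder documented in its comment: interrupt context always passes, a node in current->mems_allowed passes, a __GFP_HARDWALL request stops there, and other requests may fall back to the nearest mem_exclusive ancestor cpuset. A minimal sketch of the two-pass zonelist scan that comment describes (this is not the actual mm/page_alloc.c code, and try_alloc_from_zone() is a hypothetical placeholder):

/* Sketch only: illustrates the __GFP_HARDWALL distinction documented in
 * the cpuset_zone_allowed() comment above.  try_alloc_from_zone() is a
 * hypothetical stand-in for the real allocator fast path. */
static struct page *sketch_alloc_pages(unsigned int gfp_mask, struct zonelist *zl)
{
	struct zone **z;
	struct page *page;

	/* Pass 1: hardwall - only nodes in current->mems_allowed. */
	for (z = zl->zones; *z != NULL; z++) {
		if (!cpuset_zone_allowed(*z, gfp_mask | __GFP_HARDWALL))
			continue;
		if ((page = try_alloc_from_zone(*z, gfp_mask)) != NULL)
			return page;
	}

	/* Pass 2: requests without __GFP_HARDWALL (GFP_KERNEL style) may
	 * spill into the nearest mem_exclusive ancestor cpuset's nodes. */
	for (z = zl->zones; *z != NULL; z++) {
		if (!cpuset_zone_allowed(*z, gfp_mask))
			continue;
		if ((page = try_alloc_from_zone(*z, gfp_mask)) != NULL)
			return page;
	}
	return NULL;
}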
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/syscalls.h> 41#include <linux/syscalls.h>
42#include <linux/signal.h> 42#include <linux/signal.h>
43#include <asm/futex.h>
43 44
44#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 45#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
45 46
@@ -327,6 +328,118 @@ out:
327} 328}
328 329
329/* 330/*
331 * Wake up all waiters hashed on the physical page that is mapped
332 * to this virtual address:
333 */
334static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
335{
336 union futex_key key1, key2;
337 struct futex_hash_bucket *bh1, *bh2;
338 struct list_head *head;
339 struct futex_q *this, *next;
340 int ret, op_ret, attempt = 0;
341
342retryfull:
343 down_read(&current->mm->mmap_sem);
344
345 ret = get_futex_key(uaddr1, &key1);
346 if (unlikely(ret != 0))
347 goto out;
348 ret = get_futex_key(uaddr2, &key2);
349 if (unlikely(ret != 0))
350 goto out;
351
352 bh1 = hash_futex(&key1);
353 bh2 = hash_futex(&key2);
354
355retry:
356 if (bh1 < bh2)
357 spin_lock(&bh1->lock);
358 spin_lock(&bh2->lock);
359 if (bh1 > bh2)
360 spin_lock(&bh1->lock);
361
362 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
363 if (unlikely(op_ret < 0)) {
364 int dummy;
365
366 spin_unlock(&bh1->lock);
367 if (bh1 != bh2)
368 spin_unlock(&bh2->lock);
369
370 /* futex_atomic_op_inuser needs to both read and write
371 * *(int __user *)uaddr2, but we can't modify it
372 * non-atomically. Therefore, if get_user below is not
373 * enough, we need to handle the fault ourselves, while
374 * still holding the mmap_sem. */
375 if (attempt++) {
376 struct vm_area_struct * vma;
377 struct mm_struct *mm = current->mm;
378
379 ret = -EFAULT;
380 if (attempt >= 2 ||
381 !(vma = find_vma(mm, uaddr2)) ||
382 vma->vm_start > uaddr2 ||
383 !(vma->vm_flags & VM_WRITE))
384 goto out;
385
386 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
387 case VM_FAULT_MINOR:
388 current->min_flt++;
389 break;
390 case VM_FAULT_MAJOR:
391 current->maj_flt++;
392 break;
393 default:
394 goto out;
395 }
396 goto retry;
397 }
398
399 /* If we would have faulted, release mmap_sem,
400 * fault it in and start all over again. */
401 up_read(&current->mm->mmap_sem);
402
403 ret = get_user(dummy, (int __user *)uaddr2);
404 if (ret)
405 return ret;
406
407 goto retryfull;
408 }
409
410 head = &bh1->chain;
411
412 list_for_each_entry_safe(this, next, head, list) {
413 if (match_futex (&this->key, &key1)) {
414 wake_futex(this);
415 if (++ret >= nr_wake)
416 break;
417 }
418 }
419
420 if (op_ret > 0) {
421 head = &bh2->chain;
422
423 op_ret = 0;
424 list_for_each_entry_safe(this, next, head, list) {
425 if (match_futex (&this->key, &key2)) {
426 wake_futex(this);
427 if (++op_ret >= nr_wake2)
428 break;
429 }
430 }
431 ret += op_ret;
432 }
433
434 spin_unlock(&bh1->lock);
435 if (bh1 != bh2)
436 spin_unlock(&bh2->lock);
437out:
438 up_read(&current->mm->mmap_sem);
439 return ret;
440}
441
442/*
330 * Requeue all waiters hashed on one physical page to another 443 * Requeue all waiters hashed on one physical page to another
331 * physical page. 444 * physical page.
332 */ 445 */
@@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal)
673 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 786 filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
674 787
675 if (signal) { 788 if (signal) {
676 int err;
677 err = f_setown(filp, current->pid, 1); 789 err = f_setown(filp, current->pid, 1);
678 if (err < 0) { 790 if (err < 0) {
679 put_unused_fd(ret); 791 goto error;
680 put_filp(filp);
681 ret = err;
682 goto out;
683 } 792 }
684 filp->f_owner.signum = signal; 793 filp->f_owner.signum = signal;
685 } 794 }
686 795
687 q = kmalloc(sizeof(*q), GFP_KERNEL); 796 q = kmalloc(sizeof(*q), GFP_KERNEL);
688 if (!q) { 797 if (!q) {
689 put_unused_fd(ret); 798 err = -ENOMEM;
690 put_filp(filp); 799 goto error;
691 ret = -ENOMEM;
692 goto out;
693 } 800 }
694 801
695 down_read(&current->mm->mmap_sem); 802 down_read(&current->mm->mmap_sem);
@@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal)
697 804
698 if (unlikely(err != 0)) { 805 if (unlikely(err != 0)) {
699 up_read(&current->mm->mmap_sem); 806 up_read(&current->mm->mmap_sem);
700 put_unused_fd(ret);
701 put_filp(filp);
702 kfree(q); 807 kfree(q);
703 return err; 808 goto error;
704 } 809 }
705 810
706 /* 811 /*
@@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal)
716 fd_install(ret, filp); 821 fd_install(ret, filp);
717out: 822out:
718 return ret; 823 return ret;
824error:
825 put_unused_fd(ret);
826 put_filp(filp);
827 ret = err;
828 goto out;
719} 829}
720 830
721long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 831long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
740 case FUTEX_CMP_REQUEUE: 850 case FUTEX_CMP_REQUEUE:
741 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 851 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
742 break; 852 break;
853 case FUTEX_WAKE_OP:
854 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
855 break;
743 default: 856 default:
744 ret = -ENOSYS; 857 ret = -ENOSYS;
745 } 858 }
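The new FUTEX_WAKE_OP operation lets a single system call wake waiters on one futex word, atomically apply an encoded operation to a second word, and wake waiters on that second word as well if the encoded comparison on its old value succeeds. A user-space sketch, assuming a kernel and <linux/futex.h> that carry this patch (FUTEX_WAKE_OP, FUTEX_OP() and the FUTEX_OP_* constants come from that header; glibc provides no wrapper, so the raw syscall is used):

#include <limits.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Wake up to one waiter on word1; set *word2 to 0 and, if its old value
 * was greater than 0, wake up to INT_MAX waiters on word2 - all in one
 * call.  nr_wake2 travels in the slot normally used for the timeout. */
static long futex_wake_op_call(int *word1, int *word2)
{
	return syscall(SYS_futex, word1, FUTEX_WAKE_OP,
		       1,                        /* nr_wake on word1 */
		       (void *)(long)INT_MAX,    /* val2: nr_wake2 on word2 */
		       word2,
		       FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 0));
}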
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
39 struct list_head *tmp; 39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new; 40 struct inter_module_entry *ime, *ime_new;
41 41
42 if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { 42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */ 43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR 44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", 45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
47 kmalloc_failed = 1; 47 kmalloc_failed = 1;
48 return; 48 return;
49 } 49 }
50 memset(ime_new, 0, sizeof(*ime_new));
51 ime_new->im_name = im_name; 50 ime_new->im_name = im_name;
52 ime_new->owner = owner; 51 ime_new->owner = owner;
53 ime_new->userdata = userdata; 52 ime_new->userdata = userdata;
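Several files in this series (intermodule.c here, plus params.c, power/pm.c and resource.c below) convert an open-coded kmalloc()+memset() pair into kzalloc(). The conversion is behaviour-preserving; as a sketch (paraphrased, not copied from the slab headers), kzalloc() amounts to:

/* Sketch of the equivalence these conversions rely on. */
static inline void *kzalloc_sketch(size_t size, unsigned int flags)
{
	void *p = kmalloc(size, flags);

	if (p)
		memset(p, 0, size);
	return p;
}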
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
111 unsigned int status; 111 unsigned int status;
112 112
113 kstat_this_cpu.irqs[irq]++; 113 kstat_this_cpu.irqs[irq]++;
114 if (desc->status & IRQ_PER_CPU) { 114 if (CHECK_IRQ_PER_CPU(desc->status)) {
115 irqreturn_t action_ret; 115 irqreturn_t action_ret;
116 116
117 /* 117 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; 19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20 20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
21/** 25/**
22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
23 * 27 *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
19 */ 19 */
20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; 20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
21 21
22void __attribute__((weak)) 22#ifdef CONFIG_GENERIC_PENDING_IRQ
23proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 23void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{
25 /*
26 * Save these away for later use. Re-progam when the
27 * interrupt is pending
28 */
29 set_pending_irq(irq, mask_val);
30}
31#else
32void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{ 33{
25 irq_affinity[irq] = mask_val; 34 irq_affinity[irq] = mask_val;
26 irq_desc[irq].handler->set_affinity(irq, mask_val); 35 irq_desc[irq].handler->set_affinity(irq, mask_val);
27} 36}
37#endif
28 38
29static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
30 int count, int *eof, void *data) 40 int count, int *eof, void *data)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <asm-generic/sections.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
41#include <asm/errno.h> 42#include <asm/errno.h>
42#include <asm/kdebug.h> 43#include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
72 * get_insn_slot() - Find a slot on an executable page for an instruction. 73 * get_insn_slot() - Find a slot on an executable page for an instruction.
73 * We allocate an executable page if there's no room on existing ones. 74 * We allocate an executable page if there's no room on existing ones.
74 */ 75 */
75kprobe_opcode_t *get_insn_slot(void) 76kprobe_opcode_t __kprobes *get_insn_slot(void)
76{ 77{
77 struct kprobe_insn_page *kip; 78 struct kprobe_insn_page *kip;
78 struct hlist_node *pos; 79 struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
117 return kip->insns; 118 return kip->insns;
118} 119}
119 120
120void free_insn_slot(kprobe_opcode_t *slot) 121void __kprobes free_insn_slot(kprobe_opcode_t *slot)
121{ 122{
122 struct kprobe_insn_page *kip; 123 struct kprobe_insn_page *kip;
123 struct hlist_node *pos; 124 struct hlist_node *pos;
@@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot)
152} 153}
153 154
154/* Locks kprobe: irqs must be disabled */ 155/* Locks kprobe: irqs must be disabled */
155void lock_kprobes(void) 156void __kprobes lock_kprobes(void)
156{ 157{
158 unsigned long flags = 0;
159
160 /* Avoiding local interrupts to happen right after we take the kprobe_lock
161 * and before we get a chance to update kprobe_cpu, this to prevent
162 * deadlock when we have a kprobe on ISR routine and a kprobe on task
163 * routine
164 */
165 local_irq_save(flags);
166
157 spin_lock(&kprobe_lock); 167 spin_lock(&kprobe_lock);
158 kprobe_cpu = smp_processor_id(); 168 kprobe_cpu = smp_processor_id();
169
170 local_irq_restore(flags);
159} 171}
160 172
161void unlock_kprobes(void) 173void __kprobes unlock_kprobes(void)
162{ 174{
175 unsigned long flags = 0;
176
177 /* Avoiding local interrupts to happen right after we update
178 * kprobe_cpu and before we get a a chance to release kprobe_lock,
179 * this to prevent deadlock when we have a kprobe on ISR routine and
180 * a kprobe on task routine
181 */
182 local_irq_save(flags);
183
163 kprobe_cpu = NR_CPUS; 184 kprobe_cpu = NR_CPUS;
164 spin_unlock(&kprobe_lock); 185 spin_unlock(&kprobe_lock);
186
187 local_irq_restore(flags);
165} 188}
166 189
167/* You have to be holding the kprobe_lock */ 190/* You have to be holding the kprobe_lock */
168struct kprobe *get_kprobe(void *addr) 191struct kprobe __kprobes *get_kprobe(void *addr)
169{ 192{
170 struct hlist_head *head; 193 struct hlist_head *head;
171 struct hlist_node *node; 194 struct hlist_node *node;
@@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr)
183 * Aggregate handlers for multiple kprobes support - these handlers 206 * Aggregate handlers for multiple kprobes support - these handlers
184 * take care of invoking the individual kprobe handlers on p->list 207 * take care of invoking the individual kprobe handlers on p->list
185 */ 208 */
186static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 209static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
187{ 210{
188 struct kprobe *kp; 211 struct kprobe *kp;
189 212
@@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
198 return 0; 221 return 0;
199} 222}
200 223
201static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 224static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
202 unsigned long flags) 225 unsigned long flags)
203{ 226{
204 struct kprobe *kp; 227 struct kprobe *kp;
205 228
@@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
213 return; 236 return;
214} 237}
215 238
216static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 239static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
217 int trapnr) 240 int trapnr)
218{ 241{
219 /* 242 /*
220 * if we faulted "during" the execution of a user specified 243 * if we faulted "during" the execution of a user specified
@@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
227 return 0; 250 return 0;
228} 251}
229 252
230static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 253static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
231{ 254{
232 struct kprobe *kp = curr_kprobe; 255 struct kprobe *kp = curr_kprobe;
233 if (curr_kprobe && kp->break_handler) { 256 if (curr_kprobe && kp->break_handler) {
@@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
240 return 0; 263 return 0;
241} 264}
242 265
243struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) 266struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
244{ 267{
245 struct hlist_node *node; 268 struct hlist_node *node;
246 struct kretprobe_instance *ri; 269 struct kretprobe_instance *ri;
@@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
249 return NULL; 272 return NULL;
250} 273}
251 274
252static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) 275static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp)
253{ 277{
254 struct hlist_node *node; 278 struct hlist_node *node;
255 struct kretprobe_instance *ri; 279 struct kretprobe_instance *ri;
@@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
258 return NULL; 282 return NULL;
259} 283}
260 284
261void add_rp_inst(struct kretprobe_instance *ri) 285void __kprobes add_rp_inst(struct kretprobe_instance *ri)
262{ 286{
263 /* 287 /*
264 * Remove rp inst off the free list - 288 * Remove rp inst off the free list -
@@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
276 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 300 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
277} 301}
278 302
279void recycle_rp_inst(struct kretprobe_instance *ri) 303void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
280{ 304{
281 /* remove rp inst off the rprobe_inst_table */ 305 /* remove rp inst off the rprobe_inst_table */
282 hlist_del(&ri->hlist); 306 hlist_del(&ri->hlist);
@@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
291 kfree(ri); 315 kfree(ri);
292} 316}
293 317
294struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) 318struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
295{ 319{
296 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 320 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
297} 321}
@@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
302 * instances associated with this task. These left over instances represent 326 * instances associated with this task. These left over instances represent
303 * probed functions that have been called but will never return. 327 * probed functions that have been called but will never return.
304 */ 328 */
305void kprobe_flush_task(struct task_struct *tk) 329void __kprobes kprobe_flush_task(struct task_struct *tk)
306{ 330{
307 struct kretprobe_instance *ri; 331 struct kretprobe_instance *ri;
308 struct hlist_head *head; 332 struct hlist_head *head;
@@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk)
322 * This kprobe pre_handler is registered with every kretprobe. When probe 346 * This kprobe pre_handler is registered with every kretprobe. When probe
323 * hits it will set up the return probe. 347 * hits it will set up the return probe.
324 */ 348 */
325static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) 349static int __kprobes pre_handler_kretprobe(struct kprobe *p,
350 struct pt_regs *regs)
326{ 351{
327 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 352 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
328 353
@@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
353* Add the new probe to old_p->list. Fail if this is the 378* Add the new probe to old_p->list. Fail if this is the
354* second jprobe at the address - two jprobes can't coexist 379* second jprobe at the address - two jprobes can't coexist
355*/ 380*/
356static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 381static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
357{ 382{
358 struct kprobe *kp; 383 struct kprobe *kp;
359 384
@@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
395 * the intricacies 420 * the intricacies
396 * TODO: Move kcalloc outside the spinlock 421 * TODO: Move kcalloc outside the spinlock
397 */ 422 */
398static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) 423static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
424 struct kprobe *p)
399{ 425{
400 int ret = 0; 426 int ret = 0;
401 struct kprobe *ap; 427 struct kprobe *ap;
@@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
434 spin_unlock_irqrestore(&kprobe_lock, flags); 460 spin_unlock_irqrestore(&kprobe_lock, flags);
435} 461}
436 462
437int register_kprobe(struct kprobe *p) 463static int __kprobes in_kprobes_functions(unsigned long addr)
464{
465 if (addr >= (unsigned long)__kprobes_text_start
466 && addr < (unsigned long)__kprobes_text_end)
467 return -EINVAL;
468 return 0;
469}
470
471int __kprobes register_kprobe(struct kprobe *p)
438{ 472{
439 int ret = 0; 473 int ret = 0;
440 unsigned long flags = 0; 474 unsigned long flags = 0;
441 struct kprobe *old_p; 475 struct kprobe *old_p;
442 476
443 if ((ret = arch_prepare_kprobe(p)) != 0) { 477 if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
478 return ret;
479 if ((ret = arch_prepare_kprobe(p)) != 0)
444 goto rm_kprobe; 480 goto rm_kprobe;
445 } 481
446 spin_lock_irqsave(&kprobe_lock, flags); 482 spin_lock_irqsave(&kprobe_lock, flags);
447 old_p = get_kprobe(p->addr); 483 old_p = get_kprobe(p->addr);
448 p->nmissed = 0; 484 p->nmissed = 0;
@@ -466,7 +502,7 @@ rm_kprobe:
466 return ret; 502 return ret;
467} 503}
468 504
469void unregister_kprobe(struct kprobe *p) 505void __kprobes unregister_kprobe(struct kprobe *p)
470{ 506{
471 unsigned long flags; 507 unsigned long flags;
472 struct kprobe *old_p; 508 struct kprobe *old_p;
@@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = {
487 .priority = 0x7fffffff /* we need to notified first */ 523 .priority = 0x7fffffff /* we need to notified first */
488}; 524};
489 525
490int register_jprobe(struct jprobe *jp) 526int __kprobes register_jprobe(struct jprobe *jp)
491{ 527{
492 /* Todo: Verify probepoint is a function entry point */ 528 /* Todo: Verify probepoint is a function entry point */
493 jp->kp.pre_handler = setjmp_pre_handler; 529 jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp)
496 return register_kprobe(&jp->kp); 532 return register_kprobe(&jp->kp);
497} 533}
498 534
499void unregister_jprobe(struct jprobe *jp) 535void __kprobes unregister_jprobe(struct jprobe *jp)
500{ 536{
501 unregister_kprobe(&jp->kp); 537 unregister_kprobe(&jp->kp);
502} 538}
503 539
504#ifdef ARCH_SUPPORTS_KRETPROBES 540#ifdef ARCH_SUPPORTS_KRETPROBES
505 541
506int register_kretprobe(struct kretprobe *rp) 542int __kprobes register_kretprobe(struct kretprobe *rp)
507{ 543{
508 int ret = 0; 544 int ret = 0;
509 struct kretprobe_instance *inst; 545 struct kretprobe_instance *inst;
@@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp)
540 576
541#else /* ARCH_SUPPORTS_KRETPROBES */ 577#else /* ARCH_SUPPORTS_KRETPROBES */
542 578
543int register_kretprobe(struct kretprobe *rp) 579int __kprobes register_kretprobe(struct kretprobe *rp)
544{ 580{
545 return -ENOSYS; 581 return -ENOSYS;
546} 582}
547 583
548#endif /* ARCH_SUPPORTS_KRETPROBES */ 584#endif /* ARCH_SUPPORTS_KRETPROBES */
549 585
550void unregister_kretprobe(struct kretprobe *rp) 586void __kprobes unregister_kretprobe(struct kretprobe *rp)
551{ 587{
552 unsigned long flags; 588 unsigned long flags;
553 struct kretprobe_instance *ri; 589 struct kretprobe_instance *ri;
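Beyond the locking changes, every kprobes helper that can run while a probe is being serviced is now tagged __kprobes, and register_kprobe() rejects addresses inside that region via in_kprobes_functions(). A sketch of the mechanism, with the attribute definition paraphrased from the kprobes headers of this era and the section bounds supplied by the linker script through <asm-generic/sections.h>:

/* Sketch: __kprobes moves a function into .kprobes.text so the code that
 * services a probe can never itself be probed (which would recurse). */
#define __kprobes __attribute__((__section__(".kprobes.text")))

extern char __kprobes_text_start[], __kprobes_text_end[];

static int probe_address_is_forbidden(unsigned long addr)
{
	return addr >= (unsigned long)__kprobes_text_start &&
	       addr <  (unsigned long)__kprobes_text_end;
}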
diff --git a/kernel/module.c b/kernel/module.c
index c32995fbd8fd..4b39d3793c72 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1509,6 +1509,7 @@ static struct module *load_module(void __user *umod,
1509 long err = 0; 1509 long err = 0;
1510 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1510 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1511 struct exception_table_entry *extable; 1511 struct exception_table_entry *extable;
1512 mm_segment_t old_fs;
1512 1513
1513 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1514 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1514 umod, len, uargs); 1515 umod, len, uargs);
@@ -1779,6 +1780,24 @@ static struct module *load_module(void __user *umod,
1779 if (err < 0) 1780 if (err < 0)
1780 goto cleanup; 1781 goto cleanup;
1781 1782
1783 /* flush the icache in correct context */
1784 old_fs = get_fs();
1785 set_fs(KERNEL_DS);
1786
1787 /*
1788 * Flush the instruction cache, since we've played with text.
1789 * Do it before processing of module parameters, so the module
1790 * can provide parameter accessor functions of its own.
1791 */
1792 if (mod->module_init)
1793 flush_icache_range((unsigned long)mod->module_init,
1794 (unsigned long)mod->module_init
1795 + mod->init_size);
1796 flush_icache_range((unsigned long)mod->module_core,
1797 (unsigned long)mod->module_core + mod->core_size);
1798
1799 set_fs(old_fs);
1800
1782 mod->args = args; 1801 mod->args = args;
1783 if (obsparmindex) { 1802 if (obsparmindex) {
1784 err = obsolete_params(mod->name, mod->args, 1803 err = obsolete_params(mod->name, mod->args,
@@ -1860,7 +1879,6 @@ sys_init_module(void __user *umod,
1860 const char __user *uargs) 1879 const char __user *uargs)
1861{ 1880{
1862 struct module *mod; 1881 struct module *mod;
1863 mm_segment_t old_fs = get_fs();
1864 int ret = 0; 1882 int ret = 0;
1865 1883
1866 /* Must have permission */ 1884 /* Must have permission */
@@ -1878,19 +1896,6 @@ sys_init_module(void __user *umod,
1878 return PTR_ERR(mod); 1896 return PTR_ERR(mod);
1879 } 1897 }
1880 1898
1881 /* flush the icache in correct context */
1882 set_fs(KERNEL_DS);
1883
1884 /* Flush the instruction cache, since we've played with text */
1885 if (mod->module_init)
1886 flush_icache_range((unsigned long)mod->module_init,
1887 (unsigned long)mod->module_init
1888 + mod->init_size);
1889 flush_icache_range((unsigned long)mod->module_core,
1890 (unsigned long)mod->module_core + mod->core_size);
1891
1892 set_fs(old_fs);
1893
1894 /* Now sew it into the lists. They won't access us, since 1899 /* Now sew it into the lists. They won't access us, since
1895 strong_try_module_get() will fail. */ 1900 strong_try_module_get() will fail. */
1896 stop_machine_run(__link_module, mod, NR_CPUS); 1901 stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..fbf173215fd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
542{ 542{
543 struct module_kobject *mk; 543 struct module_kobject *mk;
544 544
545 mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); 545 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
546 memset(mk, 0, sizeof(struct module_kobject)); 546 BUG_ON(!mk);
547 547
548 mk->mod = THIS_MODULE; 548 mk->mod = THIS_MODULE;
549 kobj_set_kset_s(mk, module_subsys); 549 kobj_set_kset_s(mk, module_subsys);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 38798a2ff994..b7b532acd9fc 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
427 timr->sigq->info.si_code = SI_TIMER; 427 timr->sigq->info.si_code = SI_TIMER;
428 timr->sigq->info.si_tid = timr->it_id; 428 timr->sigq->info.si_tid = timr->it_id;
429 timr->sigq->info.si_value = timr->it_sigev_value; 429 timr->sigq->info.si_value = timr->it_sigev_value;
430
430 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 431 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
431 if (unlikely(timr->it_process->flags & PF_EXITING)) { 432 struct task_struct *leader;
432 timr->it_sigev_notify = SIGEV_SIGNAL; 433 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
433 put_task_struct(timr->it_process); 434 timr->it_process);
434 timr->it_process = timr->it_process->group_leader; 435
435 goto group; 436 if (likely(ret >= 0))
436 } 437 return ret;
437 return send_sigqueue(timr->it_sigev_signo, timr->sigq, 438
438 timr->it_process); 439 timr->it_sigev_notify = SIGEV_SIGNAL;
439 } 440 leader = timr->it_process->group_leader;
440 else { 441 put_task_struct(timr->it_process);
441 group: 442 timr->it_process = leader;
442 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
443 timr->it_process);
444 } 443 }
444
445 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
446 timr->it_process);
445} 447}
446EXPORT_SYMBOL_GPL(posix_timer_event); 448EXPORT_SYMBOL_GPL(posix_timer_event);
447 449
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 917066a5767c..c14cd9991181 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -28,7 +28,7 @@ config PM_DEBUG
28 28
29config SOFTWARE_SUSPEND 29config SOFTWARE_SUSPEND
30 bool "Software Suspend" 30 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) 31 depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP))
32 ---help--- 32 ---help---
33 Enable the possibility of suspending the machine. 33 Enable the possibility of suspending the machine.
34 It doesn't need APM. 34 It doesn't need APM.
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id, 60 unsigned long id,
61 pm_callback callback) 61 pm_callback callback)
62{ 62{
63 struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); 63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) { 64 if (dev) {
65 memset(dev, 0, sizeof(*dev));
66 dev->type = type; 65 dev->type = type;
67 dev->id = id; 66 dev->id = id;
68 dev->callback = callback; 67 dev->callback = callback;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index eaacd5cb5889..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1059,6 +1059,7 @@ int swsusp_resume(void)
1059 BUG_ON(!error); 1059 BUG_ON(!error);
1060 restore_processor_state(); 1060 restore_processor_state();
1061 restore_highmem(); 1061 restore_highmem();
1062 touch_softlockup_watchdog();
1062 device_power_up(); 1063 device_power_up();
1063 local_irq_enable(); 1064 local_irq_enable();
1064 return error; 1065 return error;
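The single line added to swsusp_resume() ties into the new soft-lockup detector (kernel/softlockup.c, added elsewhere in this series): touch_softlockup_watchdog() tells the detector that the long gap in timer ticks across suspend/resume was expected rather than a lockup. A hypothetical driver-style loop would use it the same way (device_ready() is invented for the example):

/* Hypothetical example: keep the soft-lockup detector quiet across a
 * legitimately long, non-preemptible polling loop.  device_ready() is
 * not a real kernel function. */
while (!device_ready()) {
	cpu_relax();
	touch_softlockup_watchdog();
}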
diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..a967605bc2e3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...)
514 return r; 514 return r;
515} 515}
516 516
517/* cpu currently holding logbuf_lock */
518static volatile unsigned int printk_cpu = UINT_MAX;
519
517asmlinkage int vprintk(const char *fmt, va_list args) 520asmlinkage int vprintk(const char *fmt, va_list args)
518{ 521{
519 unsigned long flags; 522 unsigned long flags;
@@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
522 static char printk_buf[1024]; 525 static char printk_buf[1024];
523 static int log_level_unknown = 1; 526 static int log_level_unknown = 1;
524 527
525 if (unlikely(oops_in_progress)) 528 preempt_disable();
529 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
530 /* If a crash is occurring during printk() on this CPU,
531 * make sure we can't deadlock */
526 zap_locks(); 532 zap_locks();
527 533
528 /* This stops the holder of console_sem just where we want him */ 534 /* This stops the holder of console_sem just where we want him */
529 spin_lock_irqsave(&logbuf_lock, flags); 535 spin_lock_irqsave(&logbuf_lock, flags);
536 printk_cpu = smp_processor_id();
530 537
531 /* Emit the output into the temporary buffer */ 538 /* Emit the output into the temporary buffer */
532 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 539 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
595 * CPU until it is officially up. We shouldn't be calling into 602 * CPU until it is officially up. We shouldn't be calling into
596 * random console drivers on a CPU which doesn't exist yet.. 603 * random console drivers on a CPU which doesn't exist yet..
597 */ 604 */
605 printk_cpu = UINT_MAX;
598 spin_unlock_irqrestore(&logbuf_lock, flags); 606 spin_unlock_irqrestore(&logbuf_lock, flags);
599 goto out; 607 goto out;
600 } 608 }
@@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
604 * We own the drivers. We can drop the spinlock and let 612 * We own the drivers. We can drop the spinlock and let
605 * release_console_sem() print the text 613 * release_console_sem() print the text
606 */ 614 */
615 printk_cpu = UINT_MAX;
607 spin_unlock_irqrestore(&logbuf_lock, flags); 616 spin_unlock_irqrestore(&logbuf_lock, flags);
608 console_may_schedule = 0; 617 console_may_schedule = 0;
609 release_console_sem(); 618 release_console_sem();
@@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
613 * allows the semaphore holder to proceed and to call the 622 * allows the semaphore holder to proceed and to call the
614 * console drivers with the output which we just produced. 623 * console drivers with the output which we just produced.
615 */ 624 */
625 printk_cpu = UINT_MAX;
616 spin_unlock_irqrestore(&logbuf_lock, flags); 626 spin_unlock_irqrestore(&logbuf_lock, flags);
617 } 627 }
618out: 628out:
629 preempt_enable();
619 return printed_len; 630 return printed_len;
620} 631}
621EXPORT_SYMBOL(printk); 632EXPORT_SYMBOL(printk);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
118 return ret; 118 return ret;
119} 119}
120 120
121static int may_attach(struct task_struct *task)
122{
123 if (!task->mm)
124 return -EPERM;
125 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) ||
127 (current->uid != task->uid) ||
128 (current->gid != task->egid) ||
129 (current->gid != task->sgid) ||
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM;
132 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM;
135
136 return security_ptrace(current, task);
137}
138
139int ptrace_may_attach(struct task_struct *task)
140{
141 int err;
142 task_lock(task);
143 err = may_attach(task);
144 task_unlock(task);
145 return !err;
146}
147
121int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
122{ 149{
123 int retval; 150 int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
127 goto bad; 154 goto bad;
128 if (task == current) 155 if (task == current)
129 goto bad; 156 goto bad;
130 if (!task->mm)
131 goto bad;
132 if(((current->uid != task->euid) ||
133 (current->uid != task->suid) ||
134 (current->uid != task->uid) ||
135 (current->gid != task->egid) ||
136 (current->gid != task->sgid) ||
137 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
138 goto bad;
139 smp_rmb();
140 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
141 goto bad;
142 /* the same process cannot be attached many times */ 157 /* the same process cannot be attached many times */
143 if (task->ptrace & PT_PTRACED) 158 if (task->ptrace & PT_PTRACED)
144 goto bad; 159 goto bad;
145 retval = security_ptrace(current, task); 160 retval = may_attach(task);
146 if (retval) 161 if (retval)
147 goto bad; 162 goto bad;
148 163
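The permission checks formerly open-coded in ptrace_attach() now live in may_attach(), and ptrace_may_attach() exposes them to other callers; note the inversion: it returns non-zero when attaching would be allowed. A hypothetical caller outside ptrace could gate access to another task the same way:

/* Hypothetical caller: reuse the ptrace permission model before touching
 * another task's state. */
static int sketch_access_task(struct task_struct *task)
{
	if (!ptrace_may_attach(task))
		return -EPERM;
	/* ... safe to inspect the task here ... */
	return 0;
}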
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
430 */ 430 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
432{ 432{
433 struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); 433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 434
435 if (res) { 435 if (res) {
436 memset(res, 0, sizeof(*res));
437 res->name = name; 436 res->name = name;
438 res->start = start; 437 res->start = start;
439 res->end = start + n - 1; 438 res->end = start + n - 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index f41fa94d2070..18b95520a2e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4780,7 +4780,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4780 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4780 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4781 * hold the hotplug lock. 4781 * hold the hotplug lock.
4782 */ 4782 */
4783void cpu_attach_domain(struct sched_domain *sd, int cpu) 4783static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4784{ 4784{
4785 runqueue_t *rq = cpu_rq(cpu); 4785 runqueue_t *rq = cpu_rq(cpu);
4786 struct sched_domain *tmp; 4786 struct sched_domain *tmp;
@@ -4803,7 +4803,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
4803} 4803}
4804 4804
4805/* cpus with isolated domains */ 4805/* cpus with isolated domains */
4806cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4806static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4807 4807
4808/* Setup the mask of cpus configured for isolated domains */ 4808/* Setup the mask of cpus configured for isolated domains */
4809static int __init isolated_cpu_setup(char *str) 4809static int __init isolated_cpu_setup(char *str)
@@ -4831,8 +4831,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
4831 * covered by the given span, and will set each group's ->cpumask correctly, 4831 * covered by the given span, and will set each group's ->cpumask correctly,
4832 * and ->cpu_power to 0. 4832 * and ->cpu_power to 0.
4833 */ 4833 */
4834void init_sched_build_groups(struct sched_group groups[], 4834static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4835 cpumask_t span, int (*group_fn)(int cpu)) 4835 int (*group_fn)(int cpu))
4836{ 4836{
4837 struct sched_group *first = NULL, *last = NULL; 4837 struct sched_group *first = NULL, *last = NULL;
4838 cpumask_t covered = CPU_MASK_NONE; 4838 cpumask_t covered = CPU_MASK_NONE;
@@ -4865,12 +4865,85 @@ void init_sched_build_groups(struct sched_group groups[],
4865 last->next = first; 4865 last->next = first;
4866} 4866}
4867 4867
4868#define SD_NODES_PER_DOMAIN 16
4868 4869
4869#ifdef ARCH_HAS_SCHED_DOMAIN 4870#ifdef CONFIG_NUMA
4870extern void build_sched_domains(const cpumask_t *cpu_map); 4871/**
4871extern void arch_init_sched_domains(const cpumask_t *cpu_map); 4872 * find_next_best_node - find the next node to include in a sched_domain
4872extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); 4873 * @node: node whose sched_domain we're building
4873#else 4874 * @used_nodes: nodes already in the sched_domain
4875 *
4876 * Find the next node to include in a given scheduling domain. Simply
4877 * finds the closest node not already in the @used_nodes map.
4878 *
4879 * Should use nodemask_t.
4880 */
4881static int find_next_best_node(int node, unsigned long *used_nodes)
4882{
4883 int i, n, val, min_val, best_node = 0;
4884
4885 min_val = INT_MAX;
4886
4887 for (i = 0; i < MAX_NUMNODES; i++) {
4888 /* Start at @node */
4889 n = (node + i) % MAX_NUMNODES;
4890
4891 if (!nr_cpus_node(n))
4892 continue;
4893
4894 /* Skip already used nodes */
4895 if (test_bit(n, used_nodes))
4896 continue;
4897
4898 /* Simple min distance search */
4899 val = node_distance(node, n);
4900
4901 if (val < min_val) {
4902 min_val = val;
4903 best_node = n;
4904 }
4905 }
4906
4907 set_bit(best_node, used_nodes);
4908 return best_node;
4909}
4910
4911/**
4912 * sched_domain_node_span - get a cpumask for a node's sched_domain
4913 * @node: node whose cpumask we're constructing
4914 * @size: number of nodes to include in this span
4915 *
4916 * Given a node, construct a good cpumask for its sched_domain to span. It
4917 * should be one that prevents unnecessary balancing, but also spreads tasks
4918 * out optimally.
4919 */
4920static cpumask_t sched_domain_node_span(int node)
4921{
4922 int i;
4923 cpumask_t span, nodemask;
4924 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
4925
4926 cpus_clear(span);
4927 bitmap_zero(used_nodes, MAX_NUMNODES);
4928
4929 nodemask = node_to_cpumask(node);
4930 cpus_or(span, span, nodemask);
4931 set_bit(node, used_nodes);
4932
4933 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
4934 int next_node = find_next_best_node(node, used_nodes);
4935 nodemask = node_to_cpumask(next_node);
4936 cpus_or(span, span, nodemask);
4937 }
4938
4939 return span;
4940}
4941#endif
4942
4943/*
4944 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
4945 * can switch it on easily if needed.
4946 */
4874#ifdef CONFIG_SCHED_SMT 4947#ifdef CONFIG_SCHED_SMT
4875static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4948static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4876static struct sched_group sched_group_cpus[NR_CPUS]; 4949static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4892,36 +4965,20 @@ static int cpu_to_phys_group(int cpu)
4892} 4965}
4893 4966
4894#ifdef CONFIG_NUMA 4967#ifdef CONFIG_NUMA
4895
4896static DEFINE_PER_CPU(struct sched_domain, node_domains);
4897static struct sched_group sched_group_nodes[MAX_NUMNODES];
4898static int cpu_to_node_group(int cpu)
4899{
4900 return cpu_to_node(cpu);
4901}
4902#endif
4903
4904#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4905/* 4968/*
4906 * The domains setup code relies on siblings not spanning 4969 * The init_sched_build_groups can't handle what we want to do with node
4907 * multiple nodes. Make sure the architecture has a proper 4970 * groups, so roll our own. Now each node has its own list of groups which
4908 * siblings map: 4971 * gets dynamically allocated.
4909 */ 4972 */
4910static void check_sibling_maps(void) 4973static DEFINE_PER_CPU(struct sched_domain, node_domains);
4911{ 4974static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
4912 int i, j;
4913 4975
4914 for_each_online_cpu(i) { 4976static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
4915 for_each_cpu_mask(j, cpu_sibling_map[i]) { 4977static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
4916 if (cpu_to_node(i) != cpu_to_node(j)) { 4978
4917 printk(KERN_INFO "warning: CPU %d siblings map " 4979static int cpu_to_allnodes_group(int cpu)
4918 "to different node - isolating " 4980{
4919 "them.\n", i); 4981 return cpu_to_node(cpu);
4920 cpu_sibling_map[i] = cpumask_of_cpu(i);
4921 break;
4922 }
4923 }
4924 }
4925} 4982}
4926#endif 4983#endif
4927 4984
@@ -4929,9 +4986,24 @@ static void check_sibling_maps(void)
4929 * Build sched domains for a given set of cpus and attach the sched domains 4986 * Build sched domains for a given set of cpus and attach the sched domains
4930 * to the individual cpus 4987 * to the individual cpus
4931 */ 4988 */
4932static void build_sched_domains(const cpumask_t *cpu_map) 4989void build_sched_domains(const cpumask_t *cpu_map)
4933{ 4990{
4934 int i; 4991 int i;
4992#ifdef CONFIG_NUMA
4993 struct sched_group **sched_group_nodes = NULL;
4994 struct sched_group *sched_group_allnodes = NULL;
4995
4996 /*
4997 * Allocate the per-node list of sched groups
4998 */
4999 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5000 GFP_ATOMIC);
5001 if (!sched_group_nodes) {
5002 printk(KERN_WARNING "Can not alloc sched group node list\n");
5003 return;
5004 }
5005 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5006#endif
4935 5007
4936 /* 5008 /*
4937 * Set up domains for cpus specified by the cpu_map. 5009 * Set up domains for cpus specified by the cpu_map.
@@ -4944,11 +5016,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4944 cpus_and(nodemask, nodemask, *cpu_map); 5016 cpus_and(nodemask, nodemask, *cpu_map);
4945 5017
4946#ifdef CONFIG_NUMA 5018#ifdef CONFIG_NUMA
5019 if (cpus_weight(*cpu_map)
5020 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5021 if (!sched_group_allnodes) {
5022 sched_group_allnodes
5023 = kmalloc(sizeof(struct sched_group)
5024 * MAX_NUMNODES,
5025 GFP_KERNEL);
5026 if (!sched_group_allnodes) {
5027 printk(KERN_WARNING
5028 "Can not alloc allnodes sched group\n");
5029 break;
5030 }
5031 sched_group_allnodes_bycpu[i]
5032 = sched_group_allnodes;
5033 }
5034 sd = &per_cpu(allnodes_domains, i);
5035 *sd = SD_ALLNODES_INIT;
5036 sd->span = *cpu_map;
5037 group = cpu_to_allnodes_group(i);
5038 sd->groups = &sched_group_allnodes[group];
5039 p = sd;
5040 } else
5041 p = NULL;
5042
4947 sd = &per_cpu(node_domains, i); 5043 sd = &per_cpu(node_domains, i);
4948 group = cpu_to_node_group(i);
4949 *sd = SD_NODE_INIT; 5044 *sd = SD_NODE_INIT;
4950 sd->span = *cpu_map; 5045 sd->span = sched_domain_node_span(cpu_to_node(i));
4951 sd->groups = &sched_group_nodes[group]; 5046 sd->parent = p;
5047 cpus_and(sd->span, sd->span, *cpu_map);
4952#endif 5048#endif
4953 5049
4954 p = sd; 5050 p = sd;
@@ -4973,7 +5069,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4973 5069
4974#ifdef CONFIG_SCHED_SMT 5070#ifdef CONFIG_SCHED_SMT
4975 /* Set up CPU (sibling) groups */ 5071 /* Set up CPU (sibling) groups */
4976 for_each_online_cpu(i) { 5072 for_each_cpu_mask(i, *cpu_map) {
4977 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5073 cpumask_t this_sibling_map = cpu_sibling_map[i];
4978 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5074 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4979 if (i != first_cpu(this_sibling_map)) 5075 if (i != first_cpu(this_sibling_map))
@@ -4998,8 +5094,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4998 5094
4999#ifdef CONFIG_NUMA 5095#ifdef CONFIG_NUMA
5000 /* Set up node groups */ 5096 /* Set up node groups */
5001 init_sched_build_groups(sched_group_nodes, *cpu_map, 5097 if (sched_group_allnodes)
5002 &cpu_to_node_group); 5098 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5099 &cpu_to_allnodes_group);
5100
5101 for (i = 0; i < MAX_NUMNODES; i++) {
5102 /* Set up node groups */
5103 struct sched_group *sg, *prev;
5104 cpumask_t nodemask = node_to_cpumask(i);
5105 cpumask_t domainspan;
5106 cpumask_t covered = CPU_MASK_NONE;
5107 int j;
5108
5109 cpus_and(nodemask, nodemask, *cpu_map);
5110 if (cpus_empty(nodemask)) {
5111 sched_group_nodes[i] = NULL;
5112 continue;
5113 }
5114
5115 domainspan = sched_domain_node_span(i);
5116 cpus_and(domainspan, domainspan, *cpu_map);
5117
5118 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5119 sched_group_nodes[i] = sg;
5120 for_each_cpu_mask(j, nodemask) {
5121 struct sched_domain *sd;
5122 sd = &per_cpu(node_domains, j);
5123 sd->groups = sg;
5124 if (sd->groups == NULL) {
5125 /* Turn off balancing if we have no groups */
5126 sd->flags = 0;
5127 }
5128 }
5129 if (!sg) {
5130 printk(KERN_WARNING
5131 "Can not alloc domain group for node %d\n", i);
5132 continue;
5133 }
5134 sg->cpu_power = 0;
5135 sg->cpumask = nodemask;
5136 cpus_or(covered, covered, nodemask);
5137 prev = sg;
5138
5139 for (j = 0; j < MAX_NUMNODES; j++) {
5140 cpumask_t tmp, notcovered;
5141 int n = (i + j) % MAX_NUMNODES;
5142
5143 cpus_complement(notcovered, covered);
5144 cpus_and(tmp, notcovered, *cpu_map);
5145 cpus_and(tmp, tmp, domainspan);
5146 if (cpus_empty(tmp))
5147 break;
5148
5149 nodemask = node_to_cpumask(n);
5150 cpus_and(tmp, tmp, nodemask);
5151 if (cpus_empty(tmp))
5152 continue;
5153
5154 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5155 if (!sg) {
5156 printk(KERN_WARNING
5157 "Can not alloc domain group for node %d\n", j);
5158 break;
5159 }
5160 sg->cpu_power = 0;
5161 sg->cpumask = tmp;
5162 cpus_or(covered, covered, tmp);
5163 prev->next = sg;
5164 prev = sg;
5165 }
5166 prev->next = sched_group_nodes[i];
5167 }
5003#endif 5168#endif
5004 5169
5005 /* Calculate CPU power for physical packages and nodes */ 5170 /* Calculate CPU power for physical packages and nodes */
@@ -5018,14 +5183,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5018 sd->groups->cpu_power = power; 5183 sd->groups->cpu_power = power;
5019 5184
5020#ifdef CONFIG_NUMA 5185#ifdef CONFIG_NUMA
5021 if (i == first_cpu(sd->groups->cpumask)) { 5186 sd = &per_cpu(allnodes_domains, i);
5022 /* Only add "power" once for each physical package. */ 5187 if (sd->groups) {
5023 sd = &per_cpu(node_domains, i); 5188 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5024 sd->groups->cpu_power += power; 5189 (cpus_weight(sd->groups->cpumask)-1) / 10;
5190 sd->groups->cpu_power = power;
5025 } 5191 }
5026#endif 5192#endif
5027 } 5193 }
5028 5194
5195#ifdef CONFIG_NUMA
5196 for (i = 0; i < MAX_NUMNODES; i++) {
5197 struct sched_group *sg = sched_group_nodes[i];
5198 int j;
5199
5200 if (sg == NULL)
5201 continue;
5202next_sg:
5203 for_each_cpu_mask(j, sg->cpumask) {
5204 struct sched_domain *sd;
5205 int power;
5206
5207 sd = &per_cpu(phys_domains, j);
5208 if (j != first_cpu(sd->groups->cpumask)) {
5209 /*
5210 * Only add "power" once for each
5211 * physical package.
5212 */
5213 continue;
5214 }
5215 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5216 (cpus_weight(sd->groups->cpumask)-1) / 10;
5217
5218 sg->cpu_power += power;
5219 }
5220 sg = sg->next;
5221 if (sg != sched_group_nodes[i])
5222 goto next_sg;
5223 }
5224#endif
5225
5029 /* Attach the domains */ 5226 /* Attach the domains */
5030 for_each_cpu_mask(i, *cpu_map) { 5227 for_each_cpu_mask(i, *cpu_map) {
5031 struct sched_domain *sd; 5228 struct sched_domain *sd;
@@ -5040,13 +5237,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5040/* 5237/*
5041 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5238 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5042 */ 5239 */
5043static void arch_init_sched_domains(cpumask_t *cpu_map) 5240static void arch_init_sched_domains(const cpumask_t *cpu_map)
5044{ 5241{
5045 cpumask_t cpu_default_map; 5242 cpumask_t cpu_default_map;
5046 5243
5047#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5048 check_sibling_maps();
5049#endif
5050 /* 5244 /*
5051 * Setup mask for cpus without special case scheduling requirements. 5245 * Setup mask for cpus without special case scheduling requirements.
5052 * For now this just excludes isolated cpus, but could be used to 5246 * For now this just excludes isolated cpus, but could be used to
@@ -5059,10 +5253,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
5059 5253
5060static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5254static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5061{ 5255{
5062 /* Do nothing: everything is statically allocated. */ 5256#ifdef CONFIG_NUMA
5063} 5257 int i;
5258 int cpu;
5259
5260 for_each_cpu_mask(cpu, *cpu_map) {
5261 struct sched_group *sched_group_allnodes
5262 = sched_group_allnodes_bycpu[cpu];
5263 struct sched_group **sched_group_nodes
5264 = sched_group_nodes_bycpu[cpu];
5265
5266 if (sched_group_allnodes) {
5267 kfree(sched_group_allnodes);
5268 sched_group_allnodes_bycpu[cpu] = NULL;
5269 }
5270
5271 if (!sched_group_nodes)
5272 continue;
5273
5274 for (i = 0; i < MAX_NUMNODES; i++) {
5275 cpumask_t nodemask = node_to_cpumask(i);
5276 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5064 5277
5065#endif /* ARCH_HAS_SCHED_DOMAIN */ 5278 cpus_and(nodemask, nodemask, *cpu_map);
5279 if (cpus_empty(nodemask))
5280 continue;
5281
5282 if (sg == NULL)
5283 continue;
5284 sg = sg->next;
5285next_sg:
5286 oldsg = sg;
5287 sg = sg->next;
5288 kfree(oldsg);
5289 if (oldsg != sched_group_nodes[i])
5290 goto next_sg;
5291 }
5292 kfree(sched_group_nodes);
5293 sched_group_nodes_bycpu[cpu] = NULL;
5294 }
5295#endif
5296}
5066 5297
5067/* 5298/*
5068 * Detach sched domains from a group of cpus specified in cpu_map 5299 * Detach sched domains from a group of cpus specified in cpu_map
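The per-node groups built in the hunks above are linked into a circular singly linked list (the final prev->next assignment points back at sched_group_nodes[i]), which is why both the cpu_power accumulation and the teardown in arch_destroy_sched_domains() walk the list with a next_sg label rather than a bounded for loop. A minimal sketch of that traversal, where visit_group() is a hypothetical callback standing in for either the power update or the kfree():

static void walk_node_groups(struct sched_group *head,
			     void (*visit_group)(struct sched_group *sg))
{
	struct sched_group *sg = head;

	if (!head)
		return;
	do {
		/* fetch the link before a visit that might free sg */
		struct sched_group *next = sg->next;

		visit_group(sg);
		sg = next;
	} while (sg != head);	/* circular list: stop once we wrap around */
}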
diff --git a/kernel/signal.c b/kernel/signal.c
index d282fea81138..4980a073237f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
678 678
679/* forward decl */ 679/* forward decl */
680static void do_notify_parent_cldstop(struct task_struct *tsk, 680static void do_notify_parent_cldstop(struct task_struct *tsk,
681 struct task_struct *parent, 681 int to_self,
682 int why); 682 int why);
683 683
684/* 684/*
@@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
729 p->signal->group_stop_count = 0; 729 p->signal->group_stop_count = 0;
730 p->signal->flags = SIGNAL_STOP_CONTINUED; 730 p->signal->flags = SIGNAL_STOP_CONTINUED;
731 spin_unlock(&p->sighand->siglock); 731 spin_unlock(&p->sighand->siglock);
732 if (p->ptrace & PT_PTRACED) 732 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
733 do_notify_parent_cldstop(p, p->parent,
734 CLD_STOPPED);
735 else
736 do_notify_parent_cldstop(
737 p->group_leader,
738 p->group_leader->real_parent,
739 CLD_STOPPED);
740 spin_lock(&p->sighand->siglock); 733 spin_lock(&p->sighand->siglock);
741 } 734 }
742 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 735 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
777 p->signal->flags = SIGNAL_STOP_CONTINUED; 770 p->signal->flags = SIGNAL_STOP_CONTINUED;
778 p->signal->group_exit_code = 0; 771 p->signal->group_exit_code = 0;
779 spin_unlock(&p->sighand->siglock); 772 spin_unlock(&p->sighand->siglock);
780 if (p->ptrace & PT_PTRACED) 773 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
781 do_notify_parent_cldstop(p, p->parent,
782 CLD_CONTINUED);
783 else
784 do_notify_parent_cldstop(
785 p->group_leader,
786 p->group_leader->real_parent,
787 CLD_CONTINUED);
788 spin_lock(&p->sighand->siglock); 774 spin_lock(&p->sighand->siglock);
789 } else { 775 } else {
790 /* 776 /*
@@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1380 unsigned long flags; 1366 unsigned long flags;
1381 int ret = 0; 1367 int ret = 0;
1382 1368
1383 /*
1384 * We need the tasklist lock even for the specific
1385 * thread case (when we don't need to follow the group
1386 * lists) in order to avoid races with "p->sighand"
1387 * going away or changing from under us.
1388 */
1389 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1369 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1390 read_lock(&tasklist_lock); 1370 read_lock(&tasklist_lock);
1371
1372 if (unlikely(p->flags & PF_EXITING)) {
1373 ret = -1;
1374 goto out_err;
1375 }
1376
1391 spin_lock_irqsave(&p->sighand->siglock, flags); 1377 spin_lock_irqsave(&p->sighand->siglock, flags);
1392 1378
1393 if (unlikely(!list_empty(&q->list))) { 1379 if (unlikely(!list_empty(&q->list))) {
1394 /* 1380 /*
 1395 * If an SI_TIMER entry is already queued, just increment 1381
@@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1399 BUG(); 1385 BUG();
1400 q->info.si_overrun++; 1386 q->info.si_overrun++;
1401 goto out; 1387 goto out;
1402 } 1388 }
1403 /* Short-circuit ignored signals. */ 1389 /* Short-circuit ignored signals. */
1404 if (sig_ignored(p, sig)) { 1390 if (sig_ignored(p, sig)) {
1405 ret = 1; 1391 ret = 1;
@@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1414 1400
1415out: 1401out:
1416 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1402 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1403out_err:
1417 read_unlock(&tasklist_lock); 1404 read_unlock(&tasklist_lock);
1418 return(ret); 1405
1406 return ret;
1419} 1407}
1420 1408
1421int 1409int
@@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1542 spin_unlock_irqrestore(&psig->siglock, flags); 1530 spin_unlock_irqrestore(&psig->siglock, flags);
1543} 1531}
1544 1532
1545static void 1533static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
1546do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
1547 int why)
1548{ 1534{
1549 struct siginfo info; 1535 struct siginfo info;
1550 unsigned long flags; 1536 unsigned long flags;
1537 struct task_struct *parent;
1551 struct sighand_struct *sighand; 1538 struct sighand_struct *sighand;
1552 1539
1540 if (to_self)
1541 parent = tsk->parent;
1542 else {
1543 tsk = tsk->group_leader;
1544 parent = tsk->real_parent;
1545 }
1546
1553 info.si_signo = SIGCHLD; 1547 info.si_signo = SIGCHLD;
1554 info.si_errno = 0; 1548 info.si_errno = 0;
1555 info.si_pid = tsk->pid; 1549 info.si_pid = tsk->pid;
@@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1618 !(current->ptrace & PT_ATTACHED)) && 1612 !(current->ptrace & PT_ATTACHED)) &&
1619 (likely(current->parent->signal != current->signal) || 1613 (likely(current->parent->signal != current->signal) ||
1620 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1614 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1621 do_notify_parent_cldstop(current, current->parent, 1615 do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
1622 CLD_TRAPPED);
1623 read_unlock(&tasklist_lock); 1616 read_unlock(&tasklist_lock);
1624 schedule(); 1617 schedule();
1625 } else { 1618 } else {
@@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code)
1668static void 1661static void
1669finish_stop(int stop_count) 1662finish_stop(int stop_count)
1670{ 1663{
1664 int to_self;
1665
1671 /* 1666 /*
1672 * If there are no other threads in the group, or if there is 1667 * If there are no other threads in the group, or if there is
1673 * a group stop in progress and we are the last to stop, 1668 * a group stop in progress and we are the last to stop,
1674 * report to the parent. When ptraced, every thread reports itself. 1669 * report to the parent. When ptraced, every thread reports itself.
1675 */ 1670 */
1676 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { 1671 if (stop_count < 0 || (current->ptrace & PT_PTRACED))
1677 read_lock(&tasklist_lock); 1672 to_self = 1;
1678 do_notify_parent_cldstop(current, current->parent, 1673 else if (stop_count == 0)
1679 CLD_STOPPED); 1674 to_self = 0;
1680 read_unlock(&tasklist_lock); 1675 else
1681 } 1676 goto out;
1682 else if (stop_count == 0) {
1683 read_lock(&tasklist_lock);
1684 do_notify_parent_cldstop(current->group_leader,
1685 current->group_leader->real_parent,
1686 CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689 1677
1678 read_lock(&tasklist_lock);
1679 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1680 read_unlock(&tasklist_lock);
1681
1682out:
1690 schedule(); 1683 schedule();
1691 /* 1684 /*
1692 * Now we don't run again until continued. 1685 * Now we don't run again until continued.
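The rewritten do_notify_parent_cldstop() takes a to_self flag instead of an explicit parent pointer: when the task reports for itself (the ptraced case) the notification goes to tsk->parent, otherwise the group leader reports to its real_parent. A hedged sketch of how a call site collapses under the new signature, mirroring the handle_stop_signal() hunks above (report_group_stop() itself is a made-up wrapper, not part of the patch):

/* Sketch only: drop the siglock around the notification, as the call
 * sites above do, and let the callee pick the right parent. */
static void report_group_stop(struct task_struct *p, int why)
{
	spin_unlock(&p->sighand->siglock);
	do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), why);
	spin_lock(&p->sighand->siglock);
}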
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat
5 *
 6 * this code detects soft lockups: incidents where the kernel
 7 * does not reschedule on a CPU for 10 seconds or more.
8 */
9
10#include <linux/mm.h>
11#include <linux/cpu.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/kthread.h>
15#include <linux/notifier.h>
16#include <linux/module.h>
17
18static DEFINE_SPINLOCK(print_lock);
19
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23
24static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event,
26 void *ptr)
27{
28 did_panic = 1;
29
30 return NOTIFY_DONE;
31}
32
33static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic,
35};
36
37void touch_softlockup_watchdog(void)
38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
40}
41EXPORT_SYMBOL(touch_softlockup_watchdog);
42
43/*
44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not:
46 */
47void softlockup_tick(struct pt_regs *regs)
48{
49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu);
51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp)
53 return;
54
55 /* Do not cause a second panic when there already was one */
56 if (did_panic)
57 return;
58
59 if (time_after(jiffies, timestamp + 10*HZ)) {
60 per_cpu(print_timestamp, this_cpu) = timestamp;
61
62 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu);
65 show_regs(regs);
66 spin_unlock(&print_lock);
67 }
68}
69
70/*
71 * The watchdog thread - runs every second and touches the timestamp.
72 */
73static int watchdog(void * __bind_cpu)
74{
75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79
80 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE;
82
83 set_current_state(TASK_INTERRUPTIBLE);
84
85 /*
86 * Run briefly once per second - if this gets delayed for
87 * more than 10 seconds then the debug-printout triggers
88 * in softlockup_tick():
89 */
90 while (!kthread_should_stop()) {
91 msleep_interruptible(1000);
92 touch_softlockup_watchdog();
93 }
94 __set_current_state(TASK_RUNNING);
95
96 return 0;
97}
98
99/*
100 * Create/destroy watchdog threads as CPUs come and go:
101 */
102static int __devinit
103cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
104{
105 int hotcpu = (unsigned long)hcpu;
106 struct task_struct *p;
107
108 switch (action) {
109 case CPU_UP_PREPARE:
110 BUG_ON(per_cpu(watchdog_task, hotcpu));
111 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
112 if (IS_ERR(p)) {
113 printk("watchdog for %i failed\n", hotcpu);
114 return NOTIFY_BAD;
115 }
116 per_cpu(watchdog_task, hotcpu) = p;
117 kthread_bind(p, hotcpu);
118 break;
119 case CPU_ONLINE:
120
121 wake_up_process(per_cpu(watchdog_task, hotcpu));
122 break;
123#ifdef CONFIG_HOTPLUG_CPU
124 case CPU_UP_CANCELED:
125 /* Unbind so it can run. Fall thru. */
126 kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
127 case CPU_DEAD:
128 p = per_cpu(watchdog_task, hotcpu);
129 per_cpu(watchdog_task, hotcpu) = NULL;
130 kthread_stop(p);
131 break;
132#endif /* CONFIG_HOTPLUG_CPU */
133 }
134 return NOTIFY_OK;
135}
136
137static struct notifier_block __devinitdata cpu_nfb = {
138 .notifier_call = cpu_callback
139};
140
141__init void spawn_softlockup_task(void)
142{
143 void *cpu = (void *)(long)smp_processor_id();
144
145 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
146 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
147 register_cpu_notifier(&cpu_nfb);
148
149 notifier_chain_register(&panic_notifier_list, &panic_block);
150}
151
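The exported touch_softlockup_watchdog() is the escape hatch for code that legitimately monopolises a CPU (slow console writes, long resume paths and the like): calling it resets the per-CPU timestamp so softlockup_tick() does not fire. A hedged sketch of such a caller, where do_one_slow_step() is a hypothetical placeholder and the declaration is assumed to live in <linux/sched.h> as in the rest of this series:

#include <linux/sched.h>	/* assumed home of the touch_softlockup_watchdog() declaration */

static void long_noninterruptible_work(int nr_steps)
{
	int i;

	for (i = 0; i < nr_steps; i++) {
		do_one_slow_step(i);		/* hypothetical, CPU-bound step */
		touch_softlockup_watchdog();	/* we are busy, not hung */
	}
}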
diff --git a/kernel/sys.c b/kernel/sys.c
index 0bcaed6560ac..c80412be2302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1711,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1711 unsigned long arg4, unsigned long arg5) 1711 unsigned long arg4, unsigned long arg5)
1712{ 1712{
1713 long error; 1713 long error;
1714 int sig;
1715 1714
1716 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1715 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1717 if (error) 1716 if (error)
@@ -1719,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1719 1718
1720 switch (option) { 1719 switch (option) {
1721 case PR_SET_PDEATHSIG: 1720 case PR_SET_PDEATHSIG:
1722 sig = arg2; 1721 if (!valid_signal(arg2)) {
1723 if (!valid_signal(sig)) {
1724 error = -EINVAL; 1722 error = -EINVAL;
1725 break; 1723 break;
1726 } 1724 }
1727 current->pdeath_signal = sig; 1725 current->pdeath_signal = arg2;
1728 break; 1726 break;
1729 case PR_GET_PDEATHSIG: 1727 case PR_GET_PDEATHSIG:
1730 error = put_user(current->pdeath_signal, (int __user *)arg2); 1728 error = put_user(current->pdeath_signal, (int __user *)arg2);
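The PR_SET_PDEATHSIG value still passes through valid_signal() before being stored in current->pdeath_signal; only the redundant local variable goes away. From userspace the same path is exercised with prctl(2); a hedged example of a child asking to be sent SIGTERM when its parent exits:

#include <signal.h>
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* An out-of-range signal number here is what the valid_signal()
	 * check above rejects with -EINVAL. */
	if (prctl(PR_SET_PDEATHSIG, SIGTERM) == -1) {
		perror("prctl(PR_SET_PDEATHSIG)");
		return 1;
	}
	return 0;
}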
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..13e2b513be01 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
950{ 950{
951 jiffies_64++; 951 jiffies_64++;
952 update_times(); 952 update_times();
953 softlockup_tick(regs);
953} 954}
954 955
955#ifdef __ARCH_WANT_SYS_ALARM 956#ifdef __ARCH_WANT_SYS_ALARM
@@ -1428,7 +1429,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
1428 } 1429 }
1429} 1430}
1430 1431
1431static inline u64 time_interpolator_get_counter(void) 1432static inline u64 time_interpolator_get_counter(int writelock)
1432{ 1433{
1433 unsigned int src = time_interpolator->source; 1434 unsigned int src = time_interpolator->source;
1434 1435
@@ -1442,6 +1443,15 @@ static inline u64 time_interpolator_get_counter(void)
1442 now = time_interpolator_get_cycles(src); 1443 now = time_interpolator_get_cycles(src);
1443 if (lcycle && time_after(lcycle, now)) 1444 if (lcycle && time_after(lcycle, now))
1444 return lcycle; 1445 return lcycle;
1446
1447 /* When holding the xtime write lock, there's no need
1448 * to add the overhead of the cmpxchg. Readers are
1449 * force to retry until the write lock is released.
1450 */
1451 if (writelock) {
1452 time_interpolator->last_cycle = now;
1453 return now;
1454 }
1445 /* Keep track of the last timer value returned. The use of cmpxchg here 1455 /* Keep track of the last timer value returned. The use of cmpxchg here
1446 * will cause contention in an SMP environment. 1456 * will cause contention in an SMP environment.
1447 */ 1457 */
@@ -1455,7 +1465,7 @@ static inline u64 time_interpolator_get_counter(void)
1455void time_interpolator_reset(void) 1465void time_interpolator_reset(void)
1456{ 1466{
1457 time_interpolator->offset = 0; 1467 time_interpolator->offset = 0;
1458 time_interpolator->last_counter = time_interpolator_get_counter(); 1468 time_interpolator->last_counter = time_interpolator_get_counter(1);
1459} 1469}
1460 1470
1461#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) 1471#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
@@ -1467,7 +1477,7 @@ unsigned long time_interpolator_get_offset(void)
1467 return 0; 1477 return 0;
1468 1478
1469 return time_interpolator->offset + 1479 return time_interpolator->offset +
1470 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); 1480 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1471} 1481}
1472 1482
1473#define INTERPOLATOR_ADJUST 65536 1483#define INTERPOLATOR_ADJUST 65536
@@ -1490,7 +1500,7 @@ static void time_interpolator_update(long delta_nsec)
1490 * and the tuning logic insures that. 1500 * and the tuning logic insures that.
1491 */ 1501 */
1492 1502
1493 counter = time_interpolator_get_counter(); 1503 counter = time_interpolator_get_counter(1);
1494 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1504 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1495 1505
1496 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1506 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
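The new writelock argument encodes the policy spelled out in the comment: while the xtime write lock is held, readers will retry anyway, so last_cycle can be updated with a plain store, whereas the lock-free read path keeps the cmpxchg so the value only ever moves forward. A reduced sketch of that update policy (record_last_cycle() is hypothetical, and 64-bit cmpxchg support depends on the architecture):

static u64 record_last_cycle(u64 *last, u64 now, int writelock)
{
	u64 prev = *last;

	if (writelock) {
		/* xtime write lock held: readers retry, a plain store is enough */
		*last = now;
		return now;
	}
	/* lock-free path: advance 'last' atomically; if we lose the race,
	 * someone else stored a newer value, so return what is there now */
	if (cmpxchg(last, prev, now) == prev)
		return now;
	return *last;
}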
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c7e36d4a70ca..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
308 struct workqueue_struct *wq; 308 struct workqueue_struct *wq;
309 struct task_struct *p; 309 struct task_struct *p;
310 310
311 wq = kmalloc(sizeof(*wq), GFP_KERNEL); 311 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
312 if (!wq) 312 if (!wq)
313 return NULL; 313 return NULL;
314 memset(wq, 0, sizeof(*wq));
315 314
316 wq->name = name; 315 wq->name = name;
317 /* We don't need the distraction of CPUs appearing and vanishing. */ 316 /* We don't need the distraction of CPUs appearing and vanishing. */
@@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
499 case CPU_UP_PREPARE: 498 case CPU_UP_PREPARE:
500 /* Create a new workqueue thread for it. */ 499 /* Create a new workqueue thread for it. */
501 list_for_each_entry(wq, &workqueues, list) { 500 list_for_each_entry(wq, &workqueues, list) {
502 if (create_workqueue_thread(wq, hotcpu) < 0) { 501 if (!create_workqueue_thread(wq, hotcpu)) {
503 printk("workqueue for %i failed\n", hotcpu); 502 printk("workqueue for %i failed\n", hotcpu);
504 return NOTIFY_BAD; 503 return NOTIFY_BAD;
505 } 504 }
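The kzalloc() hunk above replaces an allocate-then-clear pair with a single zero-initialising allocation. As a hedged, generic illustration of the same pattern (my_struct is a placeholder type, not part of the patch):

#include <linux/slab.h>

struct my_struct {
	int a;
	void *b;
};

static struct my_struct *alloc_cleared(void)
{
	/* equivalent to kmalloc(sizeof(struct my_struct), GFP_KERNEL)
	 * followed by memset(ptr, 0, sizeof(struct my_struct)) */
	return kzalloc(sizeof(struct my_struct), GFP_KERNEL);
}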