24 files changed, 2103 insertions, 600 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000000..0b46a5dff4c0
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,65 @@
+choice
+        prompt "Preemption Model"
+        default PREEMPT_NONE
+config PREEMPT_NONE
+        bool "No Forced Preemption (Server)"
+        help
+          This is the traditional Linux preemption model, geared towards
+          throughput. It will still provide good latencies most of the
+          time, but there are no guarantees and occasional longer delays
+          are possible.
+          Select this option if you are building a kernel for a server or
+          scientific/computation system, or if you want to maximize the
+          raw processing power of the kernel, irrespective of scheduling
+          latencies.
+config PREEMPT_VOLUNTARY
+        bool "Voluntary Kernel Preemption (Desktop)"
+        help
+          This option reduces the latency of the kernel by adding more
+          "explicit preemption points" to the kernel code. These new
+          preemption points have been selected to reduce the maximum
+          latency of rescheduling, providing faster application reactions,
+          at the cost of slightly lower throughput.
+          This allows reaction to interactive events by allowing a
+          low priority process to voluntarily preempt itself even if it
+          is in kernel mode executing a system call. This allows
+          applications to run more 'smoothly' even when the system is
+          under load.
+          Select this if you are building a kernel for a desktop system.
+config PREEMPT
+        bool "Preemptible Kernel (Low-Latency Desktop)"
+        help
+          This option reduces the latency of the kernel by making
+          all kernel code (that is not executing in a critical section)
+          preemptible.  This allows reaction to interactive events by
+          permitting a low priority process to be preempted involuntarily
+          even if it is in kernel mode executing a system call and would
+          otherwise not be about to reach a natural preemption point.
+          This allows applications to run more 'smoothly' even when the
+          system is under load, at the cost of slightly lower throughput
+          and a slight runtime overhead to kernel code.
+          Select this if you are building a kernel for a desktop or
+          embedded system with latency requirements in the milliseconds
+          range.
+endchoice
+config PREEMPT_BKL
+        bool "Preempt The Big Kernel Lock"
+        depends on SMP || PREEMPT
+        default y
+        help
+          This option reduces the latency of the kernel by making the
+          big kernel lock preemptible.
+          Say Y here if you are building a kernel for a desktop system.
+          Say N if you are unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 628f4ccda127..53d8263ae12e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused)
 {
        int err;
-        /* Take offline: makes arch_cpu_down somewhat easier. */
-        cpu_clear(smp_processor_id(), cpu_online_map);
        /* Ensure this CPU doesn't handle any more interrupts. */
        err = __cpu_disable();
        if (err < 0)
-                cpu_set(smp_processor_id(), cpu_online_map);
+                return err;
-        else
-                /* Force idle task to run as soon as we yield: it should
-                   immediately notice cpu is offline and die quickly. */
-                sched_idle_next();
-        return err;
+        /* Force idle task to run as soon as we yield: it should
+           immediately notice cpu is offline and die quickly. */
+        sched_idle_next();
+        return 0;
 }
 int cpu_down(unsigned int cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 79dd929f4084..984c0bf3807f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -595,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
        return 0;
 }
+/*
+ * Repartition the scheduler domains around the cpuset cur.
+ *
+ * Two cpu spans are built and handed to partition_sched_domains():
+ *  - the parent span: the cpus in the parent cpuset's cpus_allowed that
+ *    are not part of any cpu exclusive child of the parent
+ *  - the current span: the cpus in cur's cpus_allowed that are not part
+ *    of any cpu exclusive child of cur (empty when cur has been removed
+ *    or is not cpu exclusive)
+ *
+ * Nothing is done when cur has no parent or an empty cpus_allowed.
+ *
+ * Call with cpuset_sem held.  May nest a call to the
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ */
+static void update_cpu_domains(struct cpuset *cur)
+{
+        struct cpuset *parent = cur->parent;
+        struct cpuset *child;
+        cpumask_t parent_span, cur_span;
+        if (!parent || cpus_empty(cur->cpus_allowed))
+                return;
+        /* Parent span: parent's cpus minus those of its exclusive children */
+        parent_span = parent->cpus_allowed;
+        list_for_each_entry(child, &parent->children, sibling) {
+                if (!is_cpu_exclusive(child))
+                        continue;
+                cpus_andnot(parent_span, parent_span, child->cpus_allowed);
+        }
+        if (!is_removed(cur) && is_cpu_exclusive(cur)) {
+                /* Live exclusive cpuset: it gets a span of its own */
+                if (cpus_empty(parent_span))
+                        return;
+                /* Current span: cur's cpus minus its exclusive children */
+                cur_span = cur->cpus_allowed;
+                list_for_each_entry(child, &cur->children, sibling) {
+                        if (!is_cpu_exclusive(child))
+                                continue;
+                        cpus_andnot(cur_span, cur_span, child->cpus_allowed);
+                }
+        } else {
+                /* Removed or non-exclusive: fold cur's cpus into the parent span */
+                cpus_or(parent_span, parent_span, cur->cpus_allowed);
+                if (cpus_equal(parent_span, cur->cpus_allowed))
+                        return;
+                cur_span = CPU_MASK_NONE;
+        }
+        lock_cpu_hotplug();
+        partition_sched_domains(&parent_span, &cur_span);
+        unlock_cpu_hotplug();
+}
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
        struct cpuset trialcs;
-        int retval;
+        int retval, cpus_unchanged;
        trialcs = *cs;
        retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -608,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
        if (cpus_empty(trialcs.cpus_allowed))
                return -ENOSPC;
        retval = validate_change(cs, &trialcs);
-        if (retval == 0)
+        if (retval < 0)
-                cs->cpus_allowed = trialcs.cpus_allowed;
+                return retval;
-        return retval;
+        cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+        cs->cpus_allowed = trialcs.cpus_allowed;
+        if (is_cpu_exclusive(cs) && !cpus_unchanged)
+                update_cpu_domains(cs);
+        return 0;
 }
 static int update_nodemask(struct cpuset *cs, char *buf)
@@ -646,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 {
        int turning_on;
        struct cpuset trialcs;
-        int err;
+        int err, cpu_exclusive_changed;
        turning_on = (simple_strtoul(buf, NULL, 10) != 0);
@@ -657,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
                clear_bit(bit, &trialcs.flags);
        err = validate_change(cs, &trialcs);
-        if (err == 0) {
+        if (err < 0)
-                if (turning_on)
+                return err;
-                        set_bit(bit, &cs->flags);
+        cpu_exclusive_changed =
-                else
+                (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
-                        clear_bit(bit, &cs->flags);
+        if (turning_on)
-        }
+                set_bit(bit, &cs->flags);
-        return err;
+        else
+                clear_bit(bit, &cs->flags);
+        if (cpu_exclusive_changed)
+                update_cpu_domains(cs);
+        return 0;
 }
 static int attach_task(struct cpuset *cs, char *buf)
@@ -1309,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
                up(&cpuset_sem);
                return -EBUSY;
        }
-        spin_lock(&cs->dentry->d_lock);
        parent = cs->parent;
        set_bit(CS_REMOVED, &cs->flags);
+        if (is_cpu_exclusive(cs))
+                update_cpu_domains(cs);
        list_del(&cs->sibling); /* delete my sibling from parent->children */
        if (list_empty(&parent->children))
                check_for_release(parent);
+        spin_lock(&cs->dentry->d_lock);
        d = dget(cs->dentry);
        cs->dentry = NULL;
        spin_unlock(&d->d_lock);
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..459ba49e376a
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,52 @@
+/*
+ *      kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ *      Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *      Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+/*
+ * Copy csize bytes, starting offset bytes into page frame pfn of
+ * "oldmem", into buf.  For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ *
+ * The whole page is first copied into a temporary kernel buffer and the
+ * atomic mapping is released before any data is handed to buf.
+ *
+ * If userbuf is non-zero buf is written with copy_to_user(), otherwise
+ * with memcpy().
+ *
+ * Returns csize on success (0 when csize is 0), -EINVAL when the
+ * requested range does not fit inside one page, -ENOMEM when the
+ * temporary buffer cannot be allocated and -EFAULT when the copy to
+ * user space fails.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+                                size_t csize, unsigned long offset, int userbuf)
+{
+        void *page, *vaddr;
+        ssize_t ret;
+        if (!csize)
+                return 0;
+        /* The bounce buffer is a single page: never read beyond it */
+        if (offset >= PAGE_SIZE || csize > PAGE_SIZE - offset)
+                return -EINVAL;
+        page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+        if (!page)
+                return -ENOMEM;
+        vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+        copy_page(page, vaddr);
+        kunmap_atomic(vaddr, KM_PTE0);
+        ret = csize;
+        if (userbuf) {
+                if (copy_to_user(buf, (page + offset), csize))
+                        ret = -EFAULT;
+        } else {
+                memcpy(buf, (page + offset), csize);
+        }
+        /* Single exit: the bounce buffer is released on every path */
+        kfree(page);
+        return ret;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index a28d11e10877..2c7806873bfd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1003,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags,
        p->pdeath_signal = 0;
        p->exit_state = 0;
-        /* Perform scheduler related setup */
-        sched_fork(p);
        /*
         * Ok, make it visible to the rest of the system.
         * We dont wake it up yet.
@@ -1014,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags,
        INIT_LIST_HEAD(&p->ptrace_children);
        INIT_LIST_HEAD(&p->ptrace_list);
+        /* Perform scheduler related setup. Assign this task to a CPU. */
+        sched_fork(p, clone_flags);
        /* Need tasklist lock for parent etc handling! */
        write_lock_irq(&tasklist_lock);
        /*
-         * The task hasn't been attached yet, so cpus_allowed mask cannot
+         * The task hasn't been attached yet, so its cpus_allowed mask will
-         * have changed. The cpus_allowed mask of the parent may have
+         * not be changed, nor will its assigned CPU.
-         * changed after it was copied first time, and it may then move to
+         *
-         * another CPU - so we re-copy it here and set the child's CPU to
+         * The cpus_allowed mask of the parent may have changed after it was
-         * the parent's CPU. This avoids alot of nasty races.
+         * copied first time - so re-copy it here, then check the child's CPU
+         * to ensure it is on a valid CPU (and if not, just force it back to
+         * parent's CPU). This avoids alot of nasty races.
         */
        p->cpus_allowed = current->cpus_allowed;
-        set_task_cpu(p, smp_processor_id());
+        if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
+                set_task_cpu(p, smp_processor_id());
        /*
         * Check for pending SIGKILL! The new thread should not be allowed
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..7843548cf2d9
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1063 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/syscalls.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/semaphore.h>
+/* Location of the reserved area for the crash kernel.
+ * Both bounds are statically initialized to 0 here.  kimage_crash_alloc()
+ * treats start and end as inclusive limits when it validates the entry
+ * point and the segments of a crash image against this resource.
+ */
+struct resource crashk_res = {
+        .name  = "Crash kernel",
+        .start = 0,
+        .end   = 0,
+        .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+/*
+ * Returns 1 when in_interrupt() is true, when p has pid 0 or pid 1, or
+ * when panic_on_oops is set; returns 0 otherwise.
+ */
+int kexec_should_crash(struct task_struct *p)
+{
+        return (in_interrupt() || p->pid == 0 || p->pid == 1 ||
+                panic_on_oops) ? 1 : 0;
+}
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+static int kimage_is_destination_range(struct kimage *image,
+                                       unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image,
+                                       unsigned int gfp_mask,
+                                       unsigned long dest);
+/*
+ * Allocate a struct kimage, initialize its bookkeeping, copy in the
+ * segment descriptors supplied by user space and sanity check them.
+ *
+ * On success *rimage is set to the new image (of type
+ * KEXEC_TYPE_DEFAULT, with entry stored as image->start) and 0 is
+ * returned.  On failure the image is freed, *rimage is left untouched
+ * and one of the following is returned:
+ *   -ENOMEM         the controlling structure could not be allocated
+ *   -EFAULT         the segment descriptors could not be read
+ *   -EADDRNOTAVAIL  a destination is not page aligned or reaches
+ *                   KEXEC_DESTINATION_MEMORY_LIMIT
+ *   -EINVAL         two destinations overlap, or a buffer is larger
+ *                   than its destination memory
+ *
+ * NOTE(review): nr_segments is not bounded here; presumably the caller
+ * has already checked it against the capacity of image->segment -
+ * confirm.
+ */
+static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
+                            unsigned long nr_segments,
+                            struct kexec_segment __user *segments)
+{
+        size_t segment_bytes;
+        struct kimage *image;
+        unsigned long i;
+        int result;
+        /* Allocate a controlling structure */
+        result = -ENOMEM;
+        image = kmalloc(sizeof(*image), GFP_KERNEL);
+        if (!image)
+                goto out;
+        memset(image, 0, sizeof(*image));
+        image->head = 0;
+        image->entry = &image->head;
+        image->last_entry = &image->head;
+        image->control_page = ~0; /* By default this does not apply */
+        image->start = entry;
+        image->type = KEXEC_TYPE_DEFAULT;
+        /* Initialize the list of control pages */
+        INIT_LIST_HEAD(&image->control_pages);
+        /* Initialize the list of destination pages */
+        INIT_LIST_HEAD(&image->dest_pages);
+        /* Initialize the list of unuseable pages */
+        INIT_LIST_HEAD(&image->unuseable_pages);
+        /* Read in the segments */
+        image->nr_segments = nr_segments;
+        segment_bytes = nr_segments * sizeof(*segments);
+        /*
+         * copy_from_user() returns the number of bytes it could not
+         * copy, which is not an error code: report -EFAULT instead of
+         * leaking a positive count to the caller.
+         */
+        if (copy_from_user(image->segment, segments, segment_bytes)) {
+                result = -EFAULT;
+                goto out;
+        }
+        /*
+         * Verify we have good destination addresses.  The caller is
+         * responsible for making certain we don't attempt to load
+         * the new image into invalid or reserved areas of RAM.  This
+         * just verifies it is an address we can use.
+         *
+         * Since the kernel does everything in page size chunks ensure
+         * the destination addresses are page aligned.  Too many
+         * special cases crop up when we don't do this.  The most
+         * insidious is getting overlapping destination addresses
+         * simply because addresses are changed to page size
+         * granularity.
+         */
+        result = -EADDRNOTAVAIL;
+        for (i = 0; i < nr_segments; i++) {
+                unsigned long mstart, mend;
+                mstart = image->segment[i].mem;
+                mend   = mstart + image->segment[i].memsz;
+                if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+                        goto out;
+                if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+                        goto out;
+        }
+        /* Verify our destination addresses do not overlap.
+         * If we allowed overlapping destination addresses
+         * through, very weird things can happen with no
+         * easy explanation as one segment stomps on another.
+         */
+        result = -EINVAL;
+        for (i = 0; i < nr_segments; i++) {
+                unsigned long mstart, mend;
+                unsigned long j;
+                mstart = image->segment[i].mem;
+                mend   = mstart + image->segment[i].memsz;
+                for (j = 0; j < i; j++) {
+                        unsigned long pstart, pend;
+                        pstart = image->segment[j].mem;
+                        pend   = pstart + image->segment[j].memsz;
+                        /* Do the segments overlap ? */
+                        if ((mend > pstart) && (mstart < pend))
+                                goto out;
+                }
+        }
+        /* Ensure our buffer sizes are no larger than
+         * our memory sizes.  This should always be the case,
+         * and it is easier to check up front than to be surprised
+         * later on.
+         */
+        result = -EINVAL;
+        for (i = 0; i < nr_segments; i++) {
+                if (image->segment[i].bufsz > image->segment[i].memsz)
+                        goto out;
+        }
+        result = 0;
+out:
+        if (result == 0)
+                *rimage = image;
+        else
+                kfree(image);
+        return result;
+}
+/*
+ * Build the kimage for a normal kexec load: allocate and validate the
+ * image with do_kimage_alloc(), then allocate the pages for the control
+ * code buffer.
+ *
+ * On success *rimage is set to the new image and 0 is returned.  On
+ * failure the image is freed, *rimage is left untouched and the error
+ * from do_kimage_alloc(), or -ENOMEM when the control code pages cannot
+ * be allocated, is returned.
+ */
+static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
+                                unsigned long nr_segments,
+                                struct kexec_segment __user *segments)
+{
+        int result;
+        struct kimage *image;
+        /* Allocate and initialize a controlling structure */
+        image = NULL;
+        result = do_kimage_alloc(&image, entry, nr_segments, segments);
+        if (result)
+                goto out;
+        /*
+         * Find a location for the control code buffer.  The pages are
+         * allocated through kimage_alloc_control_pages().
+         */
+        result = -ENOMEM;
+        image->control_code_page = kimage_alloc_control_pages(image,
+                                           get_order(KEXEC_CONTROL_CODE_SIZE));
+        if (!image->control_code_page) {
+                printk(KERN_ERR "Could not allocate control_code_buffer\n");
+                goto out;
+        }
+        result = 0;
+out:
+        /*
+         * Publish the image only once it is fully set up, so that
+         * *rimage never points at memory freed on the failure path.
+         */
+        if (result == 0)
+                *rimage = image;
+        else
+                kfree(image);
+        return result;
+}
+/*
+ * Build the kimage for a crash kernel load.  The entry point and every
+ * segment must lie inside the reserved crashk_res area (start and end
+ * inclusive); control pages are then carved out of that same area.
+ *
+ * segments is a user space pointer and is only handed on to
+ * do_kimage_alloc(), which copies the descriptors in.
+ *
+ * On success *rimage is set to the new image and 0 is returned.  On
+ * failure the image is freed, *rimage is left untouched and one of
+ * -EADDRNOTAVAIL (entry point or a segment outside the reserved area),
+ * -ENOMEM (no room for the control code pages) or the error from
+ * do_kimage_alloc() is returned.
+ */
+static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
+                                unsigned long nr_segments,
+                                struct kexec_segment __user *segments)
+{
+        int result;
+        struct kimage *image;
+        unsigned long i;
+        image = NULL;
+        /* Verify we have a valid entry point */
+        if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
+                result = -EADDRNOTAVAIL;
+                goto out;
+        }
+        /* Allocate and initialize a controlling structure */
+        result = do_kimage_alloc(&image, entry, nr_segments, segments);
+        if (result)
+                goto out;
+        /* Enable the special crash kernel control page
+         * allocation policy.
+         */
+        image->control_page = crashk_res.start;
+        image->type = KEXEC_TYPE_CRASH;
+        /*
+         * Verify we have good destination addresses.  Normally
+         * the caller is responsible for making certain we don't
+         * attempt to load the new image into invalid or reserved
+         * areas of RAM.  But crash kernels are preloaded into a
+         * reserved area of ram.  We must ensure the addresses
+         * are in the reserved area otherwise preloading the
+         * kernel could corrupt things.
+         */
+        result = -EADDRNOTAVAIL;
+        for (i = 0; i < nr_segments; i++) {
+                unsigned long mstart, mend;
+                mstart = image->segment[i].mem;
+                mend = mstart + image->segment[i].memsz - 1;
+                /* Ensure we are within the crash kernel limits */
+                if ((mstart < crashk_res.start) || (mend > crashk_res.end))
+                        goto out;
+        }
+        /*
+         * Find a location for the control code buffer.  For a crash
+         * image the pages come out of the reserved region, clear of
+         * every segment.
+         */
+        result = -ENOMEM;
+        image->control_code_page = kimage_alloc_control_pages(image,
+                                           get_order(KEXEC_CONTROL_CODE_SIZE));
+        if (!image->control_code_page) {
+                printk(KERN_ERR "Could not allocate control_code_buffer\n");
+                goto out;
+        }
+        result = 0;
+out:
+        if (result == 0)
+                *rimage = image;
+        else
+                kfree(image);
+        return result;
+}
+/*
+ * Returns 1 when the range from start up to (but not including) end
+ * overlaps the destination memory of any segment of image, 0 otherwise.
+ */
+static int kimage_is_destination_range(struct kimage *image,
+                                        unsigned long start,
+                                        unsigned long end)
+{
+        unsigned long idx = 0;
+        while (idx < image->nr_segments) {
+                unsigned long seg_start = image->segment[idx].mem;
+                unsigned long seg_end = seg_start + image->segment[idx].memsz;
+                if ((start < seg_end) && (end > seg_start))
+                        return 1;
+                idx++;
+        }
+        return 0;
+}
+/*
+ * Allocate 2^order pages with alloc_pages() and mark each of them
+ * reserved.  The order is stored in the private field of the first page
+ * so that kimage_free_pages() can recover it.  Returns the first page,
+ * or NULL when the allocation fails.
+ */
+static struct page *kimage_alloc_pages(unsigned int gfp_mask,
+                                        unsigned int order)
+{
+        struct page *first = alloc_pages(gfp_mask, order);
+        unsigned int nr_pages, idx;
+        if (!first)
+                return NULL;
+        first->mapping = NULL;
+        first->private = order;
+        nr_pages = 1 << order;
+        for (idx = 0; idx < nr_pages; idx++)
+                SetPageReserved(first + idx);
+        return first;
+}
+/*
+ * Undo kimage_alloc_pages(): read the allocation order back from the
+ * private field of the first page, clear the reserved mark on every
+ * page of the block and free the block.
+ */
+static void kimage_free_pages(struct page *page)
+{
+        unsigned int order = page->private;
+        unsigned int nr_pages = 1 << order;
+        unsigned int idx;
+        for (idx = 0; idx != nr_pages; idx++)
+                ClearPageReserved(&page[idx]);
+        __free_pages(page, order);
+}
+/*
+ * Unlink every page queued on list (through its lru member) and free it
+ * with kimage_free_pages().
+ */
+static void kimage_free_page_list(struct list_head *list)
+{
+        struct list_head *cur, *tmp;
+        /* The safe iterator is needed because each node is unlinked as we go */
+        list_for_each_safe(cur, tmp, list) {
+                struct page *page = list_entry(cur, struct page, lru);
+                list_del(cur);
+                kimage_free_pages(page);
+        }
+}
+/*
+ * Allocate 2^order control pages for a normal kexec image.
+ *
+ * Control pages are the intermediaries needed while the rest of the
+ * pages are copied to their final resting place, so they must not
+ * conflict with the destination addresses or with memory the kernel
+ * is already using.  More than one is only really needed on
+ * architectures where the MMU cannot be disabled and an identity
+ * mapped page table must be generated for all of the memory.
+ *
+ * Blocks are allocated until one is found that ends below
+ * KEXEC_CONTROL_MEMORY_LIMIT and does not overlap any destination
+ * range of the image, or until the allocator fails.  Rejected blocks
+ * are kept on a local list until the search ends and are then freed.
+ * At worst this runs in O(N) of the image size.
+ *
+ * Returns the first page of the accepted block, which is also queued on
+ * image->control_pages, or NULL when no suitable block was found.
+ */
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+                                                        unsigned int order)
+{
+        struct list_head rejected;
+        struct page *candidate;
+        unsigned int nr_pages = 1 << order;
+        INIT_LIST_HEAD(&rejected);
+        for (;;) {
+                unsigned long first_pfn, end_pfn;
+                candidate = kimage_alloc_pages(GFP_KERNEL, order);
+                if (!candidate)
+                        break;
+                first_pfn = page_to_pfn(candidate);
+                end_pfn   = first_pfn + nr_pages;
+                /* Accept the block if it is low enough and not a destination */
+                if ((end_pfn < (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) &&
+                    !kimage_is_destination_range(image,
+                                                 first_pfn << PAGE_SHIFT,
+                                                 end_pfn << PAGE_SHIFT))
+                        break;
+                list_add(&candidate->lru, &rejected);
+        }
+        /* Remember the accepted block.  Because it already sits at an
+         * address that is not a destination, it needs no entry in
+         * image->segment[].
+         */
+        if (candidate)
+                list_add(&candidate->lru, &image->control_pages);
+        /* Ideally multi-page rejects would be split into single pages
+         * and added to image->dest_pages.  For now it is simpler to
+         * just free them.
+         */
+        kimage_free_page_list(&rejected);
+        return candidate;
+}
+/*
+ * Allocate 2^order control pages for a crash kernel image out of the
+ * reserved crashk_res region.
+ *
+ * Returns the first page of the hole that was found, or NULL when no
+ * hole of the requested size exists below both crashk_res.end and
+ * KEXEC_CONTROL_MEMORY_LIMIT.  On success image->control_page is
+ * advanced to the last byte of the hole, so the next search starts
+ * beyond it.
+ */
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+                                                      unsigned int order)
+{
+        /* Control pages are special, they are the intermediaries
+         * that are needed while we copy the rest of the pages
+         * to their final resting place.  As such they must
+         * not conflict with either the destination addresses
+         * or memory the kernel is already using.
+         *
+         * Control pages are also the only pages we must allocate
+         * when loading a crash kernel.  All of the other pages
+         * are specified by the segments and we just memcpy
+         * into them directly.
+         *
+         * The only case where we really need more than one of
+         * these are for architectures where we cannot disable
+         * the MMU and must instead generate an identity mapped
+         * page table for all of the memory.
+         *
+         * Given the low demand this implements a very simple
+         * allocator that finds the first hole of the appropriate
+         * size in the reserved memory region, and allocates all
+         * of the memory up to and including the hole.
+         */
+        unsigned long hole_start, hole_end, size;
+        struct page *pages;
+        pages = NULL;
+        /* Do the shift in unsigned long so the size cannot overflow an int */
+        size = (1UL << order) << PAGE_SHIFT;
+        hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+        hole_end   = hole_start + size - 1;
+        /* The loop condition keeps the hole inside the reserved region */
+        while (hole_end <= crashk_res.end) {
+                unsigned long i;
+                if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
+                        break;
+                /* See if I overlap any of the segments */
+                for (i = 0; i < image->nr_segments; i++) {
+                        unsigned long mstart, mend;
+                        mstart = image->segment[i].mem;
+                        mend   = mstart + image->segment[i].memsz - 1;
+                        if ((hole_end >= mstart) && (hole_start <= mend)) {
+                                /* Advance the hole to the end of the segment */
+                                hole_start = (mend + (size - 1)) & ~(size - 1);
+                                hole_end   = hole_start + size - 1;
+                                break;
+                        }
+                }
+                /* If I don't overlap any segments I have found my hole! */
+                if (i == image->nr_segments) {
+                        pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+                        break;
+                }
+        }
+        if (pages)
+                image->control_page = hole_end;
+        return pages;
+}
+/*
+ * Allocate 2^order control pages using the allocator that matches the
+ * type of image.  Returns NULL when the allocation fails or when
+ * image->type is neither KEXEC_TYPE_DEFAULT nor KEXEC_TYPE_CRASH.
+ */
+struct page *kimage_alloc_control_pages(struct kimage *image,
+                                         unsigned int order)
+{
+        if (image->type == KEXEC_TYPE_DEFAULT)
+                return kimage_alloc_normal_control_pages(image, order);
+        if (image->type == KEXEC_TYPE_CRASH)
+                return kimage_alloc_crash_control_pages(image, order);
+        return NULL;
+}
+/*
+ * Append entry to the entry list of image.
+ *
+ * When the current slot is the last one available, a new page is
+ * allocated, the current slot is turned into an IND_INDIRECTION link
+ * to it and the list continues in the new page.  The slot following
+ * the stored entry is always zeroed.
+ *
+ * Returns 0 on success or -ENOMEM when a new page cannot be allocated.
+ */
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+        /* Step over the current slot if it already holds an entry */
+        if (*image->entry != 0)
+                image->entry++;
+        if (image->entry == image->last_entry) {
+                struct page *page;
+                kimage_entry_t *next_block;
+                page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+                if (!page)
+                        return -ENOMEM;
+                next_block = page_address(page);
+                /* Chain the new page in through the last slot */
+                *image->entry = virt_to_phys(next_block) | IND_INDIRECTION;
+                image->last_entry = next_block +
+                                      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+                image->entry = next_block;
+        }
+        /* Store the entry, then zero the slot that follows it */
+        *image->entry++ = entry;
+        *image->entry = 0;
+        return 0;
+}
+/*
+ * Add an IND_DESTINATION entry for @destination, rounded down to a
+ * page boundary.  On success the rounded address also becomes
+ * image->destination.
+ *
+ * Returns 0 on success or the error from kimage_add_entry().
+ */
+static int kimage_set_destination(struct kimage *image,
+                                   unsigned long destination)
+{
+        int err;
+        destination &= PAGE_MASK;
+        err = kimage_add_entry(image, destination | IND_DESTINATION);
+        if (err)
+                return err;
+        image->destination = destination;
+        return 0;
+}
+/*
+ * Add an IND_SOURCE entry for the page at address @page (rounded down
+ * to a page boundary).  On success image->destination advances by one
+ * page.
+ *
+ * Returns 0 on success or the error from kimage_add_entry().
+ */
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+        int err;
+        err = kimage_add_entry(image, (page & PAGE_MASK) | IND_SOURCE);
+        if (err)
+                return err;
+        image->destination += PAGE_SIZE;
+        return 0;
+}
+/*
+ * Release the pages parked on the image's two holding lists.
+ */
+static void kimage_free_extra_pages(struct kimage *image)
+{
+        /* Destination pages that were set aside for later use */
+        kimage_free_page_list(&image->dest_pages);
+        /* Pages that were allocated but could not be used at all */
+        kimage_free_page_list(&image->unuseable_pages);
+}
+/*
+ * Close the entry list of @image by writing IND_DONE into the first
+ * free slot.  Always returns 0.
+ */
+static int kimage_terminate(struct kimage *image)
+{
+        kimage_entry_t *slot = image->entry;
+        /* Do not overwrite an entry that is still in use */
+        if (*slot != 0)
+                slot++;
+        *slot = IND_DONE;
+        image->entry = slot;
+        return 0;
+}
+/*
+ * Walk the entry list of @image.  @ptr points at the current slot and
+ * @entry holds its value; the walk stops at a zero entry or at an
+ * entry with IND_DONE set.  An IND_INDIRECTION entry is visited and
+ * then followed to the page it names; any other entry steps to the
+ * next slot.
+ *
+ * Every argument is parenthesized so that arbitrary expressions can
+ * be passed safely.  @ptr and @entry are evaluated several times and
+ * must be plain lvalues without side effects.
+ */
+#define for_each_kimage_entry(image, ptr, entry) \
+        for ((ptr) = &(image)->head; \
+                ((entry) = *(ptr)) && !((entry) & IND_DONE); \
+                (ptr) = ((entry) & IND_INDIRECTION) ? \
+                        phys_to_virt((entry) & PAGE_MASK) : (ptr) + 1)
+/*
+ * Free the page whose address is stored in @entry.  The flag bits in
+ * the low part of the entry are dropped by the shift.
+ */
+static void kimage_free_entry(kimage_entry_t entry)
+{
+        kimage_free_pages(pfn_to_page(entry >> PAGE_SHIFT));
+}
+/*
+ * Tear down @image: free the holding lists, every source page and
+ * indirection page named in the entry list, run the machine specific
+ * cleanup, free the control pages and finally the image itself.
+ * A NULL @image is ignored.
+ */
+static void kimage_free(struct kimage *image)
+{
+        kimage_entry_t *slot, cur;
+        kimage_entry_t pending_ind = 0;
+        if (!image)
+                return;
+        kimage_free_extra_pages(image);
+        for_each_kimage_entry(image, slot, cur) {
+                if (cur & IND_INDIRECTION) {
+                        /*
+                         * The walk is about to move onto the page this
+                         * entry names, so the previous indirection page
+                         * can go now.  This one must stay until the
+                         * walk has left it.
+                         */
+                        if (pending_ind & IND_INDIRECTION)
+                                kimage_free_entry(pending_ind);
+                        pending_ind = cur;
+                } else if (cur & IND_SOURCE) {
+                        kimage_free_entry(cur);
+                }
+        }
+        /* The last indirection page is still outstanding */
+        if (pending_ind & IND_INDIRECTION)
+                kimage_free_entry(pending_ind);
+        /* Machine specific cleanup comes before the control pages go */
+        machine_kexec_cleanup(image);
+        kimage_free_page_list(&image->control_pages);
+        kfree(image);
+}
+/*
+ * Find the source entry whose data is destined for address @page.
+ *
+ * The walk tracks the running destination: an IND_DESTINATION entry
+ * resets it, each IND_SOURCE entry consumes one page of it.
+ *
+ * Returns a pointer to the matching source slot, or NULL if no source
+ * page has been assigned to that destination yet.
+ */
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+                                        unsigned long page)
+{
+        kimage_entry_t *slot, cur;
+        unsigned long dest = 0;
+        for_each_kimage_entry(image, slot, cur) {
+                if (cur & IND_DESTINATION) {
+                        dest = cur & PAGE_MASK;
+                        continue;
+                }
+                if (!(cur & IND_SOURCE))
+                        continue;
+                if (dest == page)
+                        return slot;
+                dest += PAGE_SIZE;
+        }
+        return NULL;
+}
+/*
+ * Allocate one page to hold image data that will end up at
+ * @destination.
+ *
+ * @image:       image the page is allocated for
+ * @gfp_mask:    allocation flags handed to kimage_alloc_pages()
+ * @destination: address the data is finally copied to
+ *
+ * Returns the page, or NULL if memory ran out.
+ */
+static struct page *kimage_alloc_page(struct kimage *image,
+                                        unsigned int gfp_mask,
+                                        unsigned long destination)
+{
+        /*
+         * Here we implement safeguards to ensure that a source page
+         * is not copied to its destination page before the data on
+         * the destination page is no longer useful.
+         *
+         * To do this we maintain the invariant that a source page is
+         * either its own destination page, or it is not a
+         * destination page at all.
+         *
+         * That is slightly stronger than required, but the proof
+         * that no problems will occur is trivial, and the
+         * implementation is simple to verify.
+         *
+         * When allocating all pages normally this algorithm will run
+         * in O(N) time, but in the worst case it will run in O(N^2)
+         * time.   If the runtime is a problem the data structures can
+         * be fixed.
+         */
+        struct page *page;
+        unsigned long addr;
+        /*
+         * Walk through the list of destination pages, and see if I
+         * have a match.
+         */
+        list_for_each_entry(page, &image->dest_pages, lru) {
+                addr = page_to_pfn(page) << PAGE_SHIFT;
+                if (addr == destination) {
+                        list_del(&page->lru);
+                        return page;
+                }
+        }
+        page = NULL;
+        while (1) {
+                kimage_entry_t *old;
+                /* Allocate a page, if we run out of memory give up */
+                page = kimage_alloc_pages(gfp_mask, 0);
+                if (!page)
+                        return NULL;
+                /* If the page cannot be used file it away */
+                if (page_to_pfn(page) >
+                                (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+                        list_add(&page->lru, &image->unuseable_pages);
+                        continue;
+                }
+                addr = page_to_pfn(page) << PAGE_SHIFT;
+                /* If it is the destination page we want use it */
+                if (addr == destination)
+                        break;
+                /* If the page is not a destination page use it */
+                if (!kimage_is_destination_range(image, addr,
+                                                  addr + PAGE_SIZE))
+                        break;
+                /*
+                 * I know that the page is someone's destination page.
+                 * See if there is already a source page for this
+                 * destination page.  And if so swap the source pages.
+                 */
+                old = kimage_dst_used(image, addr);
+                if (old) {
+                        /* If so move it */
+                        unsigned long old_addr;
+                        struct page *old_page;
+                        old_addr = *old & PAGE_MASK;
+                        old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+                        copy_highpage(page, old_page);
+                        *old = addr | (*old & ~PAGE_MASK);
+                        /*
+                         * The old page cannot be a destination page,
+                         * so it could be returned -- but only if it
+                         * honors the caller's gfp_mask.  It may have
+                         * been allocated as a highmem page for segment
+                         * data, while this caller (for example the
+                         * indirection page allocation, which uses
+                         * page_address()) did not allow highmem.  In
+                         * that case drop it and allocate again; its
+                         * contents already live in the new page.
+                         */
+                        if (!(gfp_mask & __GFP_HIGHMEM) &&
+                            PageHighMem(old_page)) {
+                                kimage_free_pages(old_page);
+                                continue;
+                        }
+                        addr = old_addr;
+                        page = old_page;
+                        break;
+                }
+                else {
+                        /* Place the page on the destination list I
+                         * will use it later.
+                         */
+                        list_add(&page->lru, &image->dest_pages);
+                }
+        }
+        return page;
+}
+/*
+ * Load one segment of a normal (non crash) image.
+ *
+ * The segment's memory range is covered page by page: each page is
+ * allocated with kimage_alloc_page(), recorded as a source page,
+ * cleared, and then filled with as much of the user buffer as is
+ * left.  Memory beyond the end of the buffer therefore stays zero.
+ *
+ * Returns 0 on success, -ENOMEM if a page could not be allocated,
+ * -EFAULT if the user buffer could not be read, or the error from
+ * kimage_set_destination() / kimage_add_page().
+ */
+static int kimage_load_normal_segment(struct kimage *image,
+                                         struct kexec_segment *segment)
+{
+        unsigned long maddr;
+        unsigned long ubytes, mbytes;
+        int result;
+        unsigned char *buf;
+        buf = segment->buf;
+        ubytes = segment->bufsz;
+        mbytes = segment->memsz;
+        maddr = segment->mem;
+        result = kimage_set_destination(image, maddr);
+        if (result < 0)
+                goto out;
+        while (mbytes) {
+                struct page *page;
+                char *ptr;
+                size_t uchunk, mchunk;
+                page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+                if (!page) {
+                        result  = -ENOMEM;
+                        goto out;
+                }
+                result = kimage_add_page(image, page_to_pfn(page)
+                                                                << PAGE_SHIFT);
+                if (result < 0)
+                        goto out;
+                ptr = kmap(page);
+                /* Start with a clear page */
+                memset(ptr, 0, PAGE_SIZE);
+                ptr += maddr & ~PAGE_MASK;
+                /* Bytes of segment memory that fall into this page */
+                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+                if (mchunk > mbytes)
+                        mchunk = mbytes;
+                /* Bytes of user data still available for them */
+                uchunk = mchunk;
+                if (uchunk > ubytes)
+                        uchunk = ubytes;
+                /*
+                 * copy_from_user() returns the number of bytes it
+                 * could NOT copy, never a negative errno, so any
+                 * non-zero value means the user buffer faulted.
+                 */
+                result = copy_from_user(ptr, buf, uchunk);
+                kunmap(page);
+                if (result) {
+                        result = -EFAULT;
+                        goto out;
+                }
+                ubytes -= uchunk;
+                maddr  += mchunk;
+                buf    += mchunk;
+                mbytes -= mchunk;
+        }
+out:
+        return result;
+}
+/*
+ * Load one segment of a crash image.
+ *
+ * Returns 0 on success, -ENOMEM if no page is found for an address,
+ * or -EFAULT if the user buffer could not be read.
+ */
+static int kimage_load_crash_segment(struct kimage *image,
+                                        struct kexec_segment *segment)
+{
+        /* For crash dump kernels we simply copy the data from
+         * user space to its destination.
+         * We do things a page at a time for the sake of kmap.
+         */
+        unsigned long maddr;
+        unsigned long ubytes, mbytes;
+        int result;
+        unsigned char *buf;
+        result = 0;
+        buf = segment->buf;
+        ubytes = segment->bufsz;
+        mbytes = segment->memsz;
+        maddr = segment->mem;
+        while (mbytes) {
+                struct page *page;
+                char *ptr;
+                size_t uchunk, mchunk;
+                page = pfn_to_page(maddr >> PAGE_SHIFT);
+                if (!page) {
+                        result  = -ENOMEM;
+                        goto out;
+                }
+                ptr = kmap(page);
+                ptr += maddr & ~PAGE_MASK;
+                /* Bytes of segment memory that fall into this page */
+                mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+                if (mchunk > mbytes)
+                        mchunk = mbytes;
+                /* Bytes of user data still available for them */
+                uchunk = mchunk;
+                if (uchunk > ubytes) {
+                        uchunk = ubytes;
+                        /* Zero the trailing part of the page */
+                        memset(ptr + uchunk, 0, mchunk - uchunk);
+                }
+                /*
+                 * copy_from_user() returns the number of bytes it
+                 * could NOT copy, never a negative errno, so any
+                 * non-zero value means the user buffer faulted.
+                 */
+                result = copy_from_user(ptr, buf, uchunk);
+                kunmap(page);
+                if (result) {
+                        result = -EFAULT;
+                        goto out;
+                }
+                ubytes -= uchunk;
+                maddr  += mchunk;
+                buf    += mchunk;
+                mbytes -= mchunk;
+        }
+out:
+        return result;
+}
+/*
+ * Load @segment into @image with the loader that matches image->type.
+ *
+ * Returns the loader's result, or -ENOMEM when image->type is neither
+ * KEXEC_TYPE_DEFAULT nor KEXEC_TYPE_CRASH.
+ */
+static int kimage_load_segment(struct kimage *image,
+                                struct kexec_segment *segment)
+{
+        if (image->type == KEXEC_TYPE_DEFAULT)
+                return kimage_load_normal_segment(image, segment);
+        if (image->type == KEXEC_TYPE_CRASH)
+                return kimage_load_crash_segment(image, segment);
+        return -ENOMEM;
+}
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down.  Preventing on-going dmas, and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to its final destination.  And
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+/* Image installed by sys_kexec_load() when KEXEC_ON_CRASH is not set */
+struct kimage *kexec_image = NULL;
+/* Image installed with KEXEC_ON_CRASH; taken and run by crash_kexec() */
+static struct kimage *kexec_crash_image = NULL;
+/*
+ * A home grown binary mutex.
+ * Nothing can wait so this mutex is safe to use
+ * in interrupt context :)
+ *
+ * It is taken with xchg(&kexec_lock, 1), where a non-zero return
+ * means it was already held, and released with xchg(&kexec_lock, 0).
+ */
+static int kexec_lock = 0;
+/*
+ * kexec_load system call: load a new kernel image, or unload the
+ * current one when @nr_segments is 0.
+ *
+ * @entry:       entry point handed to the image allocator
+ * @nr_segments: number of elements in @segments
+ * @segments:    user space array describing the segments to load
+ * @flags:       KEXEC_* flags plus the architecture field
+ *
+ * Returns 0 on success, -EPERM without CAP_SYS_BOOT, -EINVAL for bad
+ * flags, a foreign architecture or too many segments, -EBUSY if the
+ * kexec lock is already held, or the error of the step that failed.
+ */
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+                                struct kexec_segment __user *segments,
+                                unsigned long flags)
+{
+        struct kimage **dest_image;
+        struct kimage *image = NULL;
+        int result = 0;
+        /* Rebooting the system is reserved for the superuser. */
+        if (!capable(CAP_SYS_BOOT))
+                return -EPERM;
+        /*
+         * Outside the architecture field only bits from KEXEC_FLAGS
+         * may be set; everything else is kept free for extensions.
+         */
+        if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+                return -EINVAL;
+        /* The image must be for this architecture or the default one */
+        if ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH &&
+            (flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)
+                return -EINVAL;
+        /* Artificial cap on the number of segments we accept */
+        if (nr_segments > KEXEC_SEGMENT_MAX)
+                return -EINVAL;
+        /*
+         * Crash kernels are written straight into the reserved memory
+         * region, so two loads must never run at the same time, and a
+         * load must never overwrite a crash kernel that is in use.
+         *
+         * KISS: take the lock for every kind of load.
+         */
+        if (xchg(&kexec_lock, 1))
+                return -EBUSY;
+        dest_image = (flags & KEXEC_ON_CRASH) ?
+                        &kexec_crash_image : &kexec_image;
+        if (nr_segments > 0) {
+                unsigned long i;
+                if (flags & KEXEC_ON_CRASH) {
+                        /*
+                         * A kernel to switch to if this one crashes.
+                         * Free the current crash kernel before we
+                         * start writing over it.
+                         */
+                        kimage_free(xchg(&kexec_crash_image, NULL));
+                        result = kimage_crash_alloc(&image, entry,
+                                                     nr_segments, segments);
+                } else {
+                        /* A kernel to reboot into */
+                        result = kimage_normal_alloc(&image, entry,
+                                                        nr_segments, segments);
+                }
+                if (result)
+                        goto out;
+                result = machine_kexec_prepare(image);
+                if (result)
+                        goto out;
+                for (i = 0; i < nr_segments; i++) {
+                        result = kimage_load_segment(image, &image->segment[i]);
+                        if (result)
+                                goto out;
+                }
+                result = kimage_terminate(image);
+                if (result)
+                        goto out;
+        }
+        /* Install the new image; the old one is freed below */
+        image = xchg(dest_image, image);
+out:
+        xchg(&kexec_lock, 0); /* Release the mutex */
+        /* On success this is the old image, on failure the new one */
+        kimage_free(image);
+        return result;
+}
+#ifdef CONFIG_COMPAT
+/*
+ * Compat entry point for kexec_load.  Each compat segment descriptor
+ * is converted to the native layout in a user space scratch area and
+ * the converted array is passed on to sys_kexec_load().
+ *
+ * Returns -EINVAL for KEXEC_ARCH_DEFAULT or too many segments,
+ * -EFAULT if a descriptor cannot be read or written, and otherwise
+ * the result of sys_kexec_load().
+ */
+asmlinkage long compat_sys_kexec_load(unsigned long entry,
+                                unsigned long nr_segments,
+                                struct compat_kexec_segment __user *segments,
+                                unsigned long flags)
+{
+        struct kexec_segment __user *ksegments;
+        struct compat_kexec_segment in;
+        struct kexec_segment out;
+        unsigned long i;
+        /*
+         * A client that does not name the native architecture does
+         * not get to do anything.
+         */
+        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+                return -EINVAL;
+        if (nr_segments > KEXEC_SEGMENT_MAX)
+                return -EINVAL;
+        ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+        for (i = 0; i < nr_segments; i++) {
+                if (copy_from_user(&in, &segments[i], sizeof(in)))
+                        return -EFAULT;
+                out.buf   = compat_ptr(in.buf);
+                out.bufsz = in.bufsz;
+                out.mem   = in.mem;
+                out.memsz = in.memsz;
+                if (copy_to_user(&ksegments[i], &out, sizeof(out)))
+                        return -EFAULT;
+        }
+        return sys_kexec_load(entry, nr_segments, ksegments, flags);
+}
+#endif
+/*
+ * Switch to the loaded crash kernel, if there is one.
+ *
+ * Does nothing when the kexec lock is already held or when no crash
+ * image is loaded.
+ */
+void crash_kexec(struct pt_regs *regs)
+{
+        struct kimage *image;
+        /*
+         * Hold kexec_lock so that sys_kexec_load running on another
+         * cpu cannot replace the crash kernel while we are using it
+         * after a panic.
+         *
+         * If the crash kernel did not live in a fixed area of memory
+         * the xchg(&kexec_crash_image) alone would be sufficient.
+         * But since that memory is reused...
+         */
+        if (xchg(&kexec_lock, 1))
+                return;
+        image = xchg(&kexec_crash_image, NULL);
+        if (image) {
+                machine_crash_shutdown(regs);
+                machine_kexec(image);
+        }
+        xchg(&kexec_lock, 0);
+}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1f064a63f8cf..015fb69ad94d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
 KERNEL_ATTR_RO(hotplug_seqnum);
 #endif
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+/*
+ * sysfs show handler: print the address of crash_notes, formatted
+ * with %p and followed by a newline, into @page.  Returns the value
+ * returned by sprintf().
+ */
+static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
+{
+        return sprintf(page, "%p\n", (void *)crash_notes);
+}
+KERNEL_ATTR_RO(crash_notes);
+#endif
 decl_subsys(kernel, NULL, NULL);
 EXPORT_SYMBOL_GPL(kernel_subsys);
@@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = {
 #ifdef CONFIG_HOTPLUG
        &hotplug_seqnum_attr.attr,
 #endif
+#ifdef CONFIG_KEXEC
+        &crash_notes_attr.attr,
+#endif
        NULL
 };
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..74ba5f3e46c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
 #include <linux/sysrq.h>
 #include <linux/interrupt.h>
 #include <linux/nmi.h>
+#include <linux/kexec.h>
 int panic_timeout;
 int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
        unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
+        /*
+         * It's possible to come here directly from a panic-assertion and not
+         * have preempt disabled. Some functions called from here want
+         * preempt to be disabled. No point enabling it later though...
+         */
+        preempt_disable();
        bust_spinlocks(1);
        va_start(args, fmt);
        vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
        printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
        bust_spinlocks(0);
+        /*
+         * If we have crashed and we have a crash kernel loaded let it handle
+         * everything else.
+         * Do we want to call this before we try to display a message?
+         */
+        crash_kexec(NULL);
 #ifdef CONFIG_SMP
+        /*
+         * Note smp_send_stop is the usual smp shutdown function, which
+         * unfortunately means it may not be hardened to work in a panic
+         * situation.
+         */
        smp_send_stop();
 #endif
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
        if (!panic_blink)
                panic_blink = no_blink;
-        if (panic_timeout > 0)
+        if (panic_timeout > 0) {
-        {
                /*
                 * Delay timeout seconds before rebooting the machine. 
                 * We can't use the "normal" timers since we just panicked..
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 696387ffe49c..2c7121d9bff1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,8 +27,8 @@ config PM_DEBUG
        like suspend support.
 config SOFTWARE_SUSPEND
-        bool "Software Suspend (EXPERIMENTAL)"
+        bool "Software Suspend"
-        depends on EXPERIMENTAL && PM && SWAP
+        depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FRV || PPC32 || X86) && !SMP))
        ---help---
          Enable the possibility of suspending the machine.
          It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
          suspended image to. It will simply pick the first available swap 
          device.
+config SUSPEND_SMP
+        bool
+        depends on HOTPLUG_CPU && X86 && PM
+        default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index fbdc634135a7..2f438d0eaa13 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
 EXTRA_CFLAGS    +=      -DDEBUG
 endif
-swsusp-smp-$(CONFIG_SMP)        += smp.o
 obj-y                           := main.o process.o console.o pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)  += swsusp.o $(swsusp-smp-y) disk.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)  += swsusp.o disk.o
+obj-$(CONFIG_SUSPEND_SMP)       += smp.o
 obj-$(CONFIG_MAGIC_SYSRQ)       += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02b6764034dc..fb8de63c2919 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,8 +117,8 @@ static void finish(void)
 {
        device_resume();
        platform_finish();
-        enable_nonboot_cpus();
        thaw_processes();
+        enable_nonboot_cpus();
        pm_restore_console();
 }
@@ -131,28 +131,35 @@ static int prepare_processes(void)
        sys_sync();
+        disable_nonboot_cpus();
        if (freeze_processes()) {
                error = -EBUSY;
-                return error;
+                goto thaw;
        }
        if (pm_disk_mode == PM_DISK_PLATFORM) {
                if (pm_ops && pm_ops->prepare) {
                        if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
-                                return error;
+                                goto thaw;
                }
        }
        /* Free memory before shutting down devices. */
        free_some_memory();
        return 0;
+thaw:
+        thaw_processes();
+        enable_nonboot_cpus();
+        pm_restore_console();
+        return error;
 }
 static void unprepare_processes(void)
 {
-        enable_nonboot_cpus();
+        platform_finish();
        thaw_processes();
+        enable_nonboot_cpus();
        pm_restore_console();
 }
@@ -160,15 +167,9 @@ static int prepare_devices(void)
 {
        int error;
-        disable_nonboot_cpus();
+        if ((error = device_suspend(PMSG_FREEZE)))
-        if ((error = device_suspend(PMSG_FREEZE))) {
                printk("Some devices failed to suspend\n");
-                platform_finish();
+        return error;
-                enable_nonboot_cpus();
-                return error;
-        }
-        return 0;
 }
 /**
@@ -185,9 +186,9 @@ int pm_suspend_disk(void)
        int error;
        error = prepare_processes();
-        if (!error) {
+        if (error)
-                error = prepare_devices();
+                return error;
-        }
+        error = prepare_devices();
        if (error) {
                unprepare_processes();
@@ -250,7 +251,7 @@ static int software_resume(void)
        if ((error = prepare_processes())) {
                swsusp_close();
-                goto Cleanup;
+                goto Done;
        }
        pr_debug("PM: Reading swsusp image.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4cdebc972ff2..c94cb9e95090 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state)
        pm_prepare_console();
+        disable_nonboot_cpus();
+        if (num_online_cpus() != 1) {
+                error = -EPERM;
+                goto Enable_cpu;
+        }
        if (freeze_processes()) {
                error = -EAGAIN;
                goto Thaw;
@@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state)
                pm_ops->finish(state);
 Thaw:
        thaw_processes();
+ Enable_cpu:
+        enable_nonboot_cpus();
        pm_restore_console();
        return error;
 }
@@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state)
        if (pm_ops && pm_ops->finish)
                pm_ops->finish(state);
        thaw_processes();
+        enable_nonboot_cpus();
        pm_restore_console();
 }
@@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state)
                goto Unlock;
        }
-        /* Suspend is hard to get right on SMP. */
-        if (num_online_cpus() != 1) {
-                error = -EPERM;
-                goto Unlock;
-        }
        pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
        if ((error = suspend_prepare(state)))
                goto Unlock;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78d92dc6a1ed..0a086640bcfc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p)
 }
 /* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(unsigned long flag)
+void refrigerator(void)
 {
        /* Hmm, should we be allowed to suspend when there are realtime
           processes around? */
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag)
        current->state = TASK_UNINTERRUPTIBLE;
        pr_debug("%s entered refrigerator\n", current->comm);
        printk("=");
-        current->flags &= ~PF_FREEZE;
+        frozen_process(current);
        spin_lock_irq(&current->sighand->siglock);
        recalc_sigpending(); /* We sent fake signal, clean it up */
        spin_unlock_irq(&current->sighand->siglock);
-        current->flags |= PF_FROZEN;
+        while (frozen(current))
-        while (current->flags & PF_FROZEN)
                schedule();
        pr_debug("%s left refrigerator\n", current->comm);
        current->state = save;
@@ -57,10 +56,10 @@ void refrigerator(unsigned long flag)
 /* 0 = success, else # of processes that we failed to stop */
 int freeze_processes(void)
 {
-       int todo;
+        int todo;
-       unsigned long start_time;
+        unsigned long start_time;
        struct task_struct *g, *p;
-        
        printk( "Stopping tasks: " );
        start_time = jiffies;
        do {
@@ -70,14 +69,12 @@ int freeze_processes(void)
                        unsigned long flags;
                        if (!freezeable(p))
                                continue;
-                        if ((p->flags & PF_FROZEN) ||
+                        if ((frozen(p)) ||
                            (p->state == TASK_TRACED) ||
                            (p->state == TASK_STOPPED))
                                continue;
-                        /* FIXME: smp problem here: we may not access other process' flags
+                        freeze(p);
-                           without locking */
-                        p->flags |= PF_FREEZE;
                        spin_lock_irqsave(&p->sighand->siglock, flags);
                        signal_wake_up(p, 0);
                        spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -91,7 +88,7 @@ int freeze_processes(void)
                        return todo;
                }
        } while(todo);
-        
        printk( "|\n" );
        BUG_ON(in_atomic());
        return 0;
@@ -106,10 +103,7 @@ void thaw_processes(void)
        do_each_thread(g, p) {
                if (!freezeable(p))
                        continue;
-                if (p->flags & PF_FROZEN) {
+                if (!thaw_process(p))
-                        p->flags &= ~PF_FROZEN;
-                        wake_up_process(p);
-                } else
                        printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
        } while_each_thread(g, p);
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 457c2302ed42..bbe23079c62c 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -13,73 +13,52 @@
 #include <linux/interrupt.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 #include <asm/atomic.h>
 #include <asm/tlbflush.h>
-static atomic_t cpu_counter, freeze;
+/* This is protected by pm_sem semaphore */
+static cpumask_t frozen_cpus;
-static void smp_pause(void * data)
-{
-        struct saved_context ctxt;
-        __save_processor_state(&ctxt);
-        printk("Sleeping in:\n");
-        dump_stack();
-        atomic_inc(&cpu_counter);
-        while (atomic_read(&freeze)) {
-                /* FIXME: restore takes place at random piece inside this.
-                   This should probably be written in assembly, and
-                   preserve general-purpose registers, too
-                   What about stack? We may need to move to new stack here.
-                   This should better be ran with interrupts disabled.
-                 */
-                cpu_relax();
-                barrier();
-        }
-        atomic_dec(&cpu_counter);
-        __restore_processor_state(&ctxt);
-}
-static cpumask_t oldmask;
 void disable_nonboot_cpus(void)
 {
-        oldmask = current->cpus_allowed;
+        int cpu, error;
-        set_cpus_allowed(current, cpumask_of_cpu(0));
-        printk("Freezing CPUs (at %d)", raw_smp_processor_id());
-        current->state = TASK_INTERRUPTIBLE;
-        schedule_timeout(HZ);
-        printk("...");
-        BUG_ON(raw_smp_processor_id() != 0);
-        /* FIXME: for this to work, all the CPUs must be running
-         * "idle" thread (or we deadlock). Is that guaranteed? */
-        atomic_set(&cpu_counter, 0);
+        error = 0;
-        atomic_set(&freeze, 1);
+        cpus_clear(frozen_cpus);
-        smp_call_function(smp_pause, NULL, 0, 0);
+        printk("Freezing cpus ...\n");
-        while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
+        for_each_online_cpu(cpu) {
-                cpu_relax();
+                if (cpu == 0)
-                barrier();
+                        continue;
+                error = cpu_down(cpu);
+                if (!error) {
+                        cpu_set(cpu, frozen_cpus);
+                        printk("CPU%d is down\n", cpu);
+                        continue;
+                }
+                printk("Error taking cpu %d down: %d\n", cpu, error);
        }
-        printk("ok\n");
+        BUG_ON(smp_processor_id() != 0);
+        if (error)
+                panic("cpus not sleeping");
 }
 void enable_nonboot_cpus(void)
 {
-        printk("Restarting CPUs");
+        int cpu, error;
-        atomic_set(&freeze, 0);
-        while (atomic_read(&cpu_counter)) {
-                cpu_relax();
-                barrier();
-        }
-        printk("...");
-        set_cpus_allowed(current, oldmask);
-        schedule();
-        printk("ok\n");
+        printk("Thawing cpus ...\n");
+        for_each_cpu_mask(cpu, frozen_cpus) {
+                error = smp_prepare_cpu(cpu);
+                if (!error)
+                        error = cpu_up(cpu);
+                if (!error) {
+                        printk("CPU%d is up\n", cpu);
+                        continue;
+                }
+                printk("Error taking cpu %d up: %d\n", cpu, error);
+                panic("Not enough cpus");
+        }
+        cpus_clear(frozen_cpus);
 }
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 53f9f8720ee4..c285fc5a2320 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -10,12 +10,12 @@
 * This file is released under the GPLv2.
 *
 * I'd like to thank the following people for their work:
- * 
+ *
 * Pavel Machek <pavel@ucw.cz>:
 * Modifications, defectiveness pointing, being with me at the very beginning,
 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
 *
- * Steve Doddi <dirk@loth.demon.co.uk>: 
+ * Steve Doddi <dirk@loth.demon.co.uk>:
 * Support the possibility of hardware state restoring.
 *
 * Raph <grey.havens@earthling.net>:
@@ -84,11 +84,11 @@ extern char resume_file[];
 static unsigned int nr_copy_pages __nosavedata = 0;
 /* Suspend pagedir is allocated before final copy, therefore it
-   must be freed after resume 
+   must be freed after resume
   Warning: this is evil. There are actually two pagedirs at time of
   resume. One is "pagedir_save", which is empty frame allocated at
-   time of suspend, that must be freed. Second is "pagedir_nosave", 
+   time of suspend, that must be freed. Second is "pagedir_nosave",
   allocated at time of resume, that travels through memory not to
   collide with anything.
@@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev)
 {
        int error;
-        rw_swap_page_sync(READ, 
+        rw_swap_page_sync(READ,
                          swp_entry(root_swap, 0),
                          virt_to_page((unsigned long)&swsusp_header));
        if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
@@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
                memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
                memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
                swsusp_header.swsusp_info = prev;
-                error = rw_swap_page_sync(WRITE, 
+                error = rw_swap_page_sync(WRITE,
                                          swp_entry(root_swap, 0),
                                          virt_to_page((unsigned long)
                                                       &swsusp_header));
@@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
        int i, len;
-        
        len=strlen(resume_file);
        root_swap = 0xFFFF;
-        
        swap_list_lock();
-        for(i=0; i<MAX_SWAPFILES; i++) {
+        for (i=0; i<MAX_SWAPFILES; i++) {
                if (swap_info[i].flags == 0) {
                        swapfile_used[i]=SWAPFILE_UNUSED;
                } else {
-                        if(!len) {
+                        if (!len) {
                                printk(KERN_WARNING "resume= option should be used to set suspend device" );
-                                if(root_swap == 0xFFFF) {
+                                if (root_swap == 0xFFFF) {
                                        swapfile_used[i] = SWAPFILE_SUSPEND;
                                        root_swap = i;
                                } else
-                                        swapfile_used[i] = SWAPFILE_IGNORED;                              
+                                        swapfile_used[i] = SWAPFILE_IGNORED;
                        } else {
                                /* we ignore all swap devices that are not the resume_file */
                                if (is_resume_device(&swap_info[i])) {
@@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 * This is called after saving image so modification
 * will be lost after resume... and that's what we want.
 * we make the device unusable. A new call to
- * lock_swapdevices can unlock the devices. 
+ * lock_swapdevices can unlock the devices.
 */
 static void lock_swapdevices(void)
 {
        int i;
        swap_list_lock();
-        for(i = 0; i< MAX_SWAPFILES; i++)
+        for (i = 0; i< MAX_SWAPFILES; i++)
-                if(swapfile_used[i] == SWAPFILE_IGNORED) {
+                if (swapfile_used[i] == SWAPFILE_IGNORED) {
                        swap_info[i].flags ^= 0xFF;
                }
        swap_list_unlock();
@@ -229,7 +229,7 @@ static void lock_swapdevices(void)
 *      @loc:   Place to store the entry we used.
 *
 *      Allocate a new swap entry and 'sync' it. Note we discard -EIO
- *      errors. That is an artifact left over from swsusp. It did not 
+ *      errors. That is an artifact left over from swsusp. It did not
 *      check the return of rw_swap_page_sync() at all, since most pages
 *      written back to swap would return -EIO.
 *      This is a partial improvement, since we will at least return other
@@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
        int error = 0;
        entry = get_swap_page();
-        if (swp_offset(entry) && 
+        if (swp_offset(entry) &&
            swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
                error = rw_swap_page_sync(WRITE, entry,
                                          virt_to_page(addr));
@@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
 /**
 *      data_free - Free the swap entries used by the saved image.
 *
- *      Walk the list of used swap entries and free each one. 
+ *      Walk the list of used swap entries and free each one.
 *      This is only used for cleanup when suspend fails.
 */
 static void data_free(void)
@@ -290,7 +290,7 @@ static int data_write(void)
                mod = 1;
        printk( "Writing data to swap (%d pages)...     ", nr_copy_pages );
-        for_each_pbe(p, pagedir_nosave) {
+        for_each_pbe (p, pagedir_nosave) {
                if (!(i%mod))
                        printk( "\b\b\b\b%3d%%", i / mod );
                if ((error = write_page(p->address, &(p->swap_address))))
@@ -335,7 +335,7 @@ static int close_swap(void)
        dump_info();
        error = write_page((unsigned long)&swsusp_info, &entry);
-        if (!error) { 
+        if (!error) {
                printk( "S" );
                error = mark_swapfiles(entry);
                printk( "|\n" );
@@ -370,7 +370,7 @@ static int write_pagedir(void)
        struct pbe * pbe;
        printk( "Writing pagedir...");
-        for_each_pb_page(pbe, pagedir_nosave) {
+        for_each_pb_page (pbe, pagedir_nosave) {
                if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
                        return error;
        }
@@ -472,7 +472,7 @@ static int save_highmem(void)
        int res = 0;
        pr_debug("swsusp: Saving Highmem\n");
-        for_each_zone(zone) {
+        for_each_zone (zone) {
                if (is_highmem(zone))
                        res = save_highmem_zone(zone);
                if (res)
@@ -547,7 +547,7 @@ static void count_data_pages(void)
        nr_copy_pages = 0;
-        for_each_zone(zone) {
+        for_each_zone (zone) {
                if (is_highmem(zone))
                        continue;
                mark_free_pages(zone);
@@ -562,9 +562,9 @@ static void copy_data_pages(void)
        struct zone *zone;
        unsigned long zone_pfn;
        struct pbe * pbe = pagedir_nosave;
-        
        pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
-        for_each_zone(zone) {
+        for_each_zone (zone) {
                if (is_highmem(zone))
                        continue;
                mark_free_pages(zone);
@@ -702,7 +702,7 @@ static void free_image_pages(void)
 {
        struct pbe * p;
-        for_each_pbe(p, pagedir_save) {
+        for_each_pbe (p, pagedir_save) {
                if (p->address) {
                        ClearPageNosave(virt_to_page(p->address));
                        free_page(p->address);
@@ -719,7 +719,7 @@ static int alloc_image_pages(void)
 {
        struct pbe * p;
-        for_each_pbe(p, pagedir_save) {
+        for_each_pbe (p, pagedir_save) {
                p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
                if (!p->address)
                        return -ENOMEM;
@@ -740,7 +740,7 @@ void swsusp_free(void)
 /**
 *      enough_free_mem - Make sure we enough free memory to snapshot.
 *
- *      Returns TRUE or FALSE after checking the number of available 
+ *      Returns TRUE or FALSE after checking the number of available
 *      free pages.
 */
@@ -758,11 +758,11 @@ static int enough_free_mem(void)
 /**
 *      enough_swap - Make sure we have enough swap to save the image.
 *
- *      Returns TRUE or FALSE after checking the total amount of swap 
+ *      Returns TRUE or FALSE after checking the total amount of swap
 *      space avaiable.
 *
 *      FIXME: si_swapinfo(&i) returns all swap devices information.
- *      We should only consider resume_device. 
+ *      We should only consider resume_device.
 */
 static int enough_swap(void)
@@ -781,18 +781,18 @@ static int swsusp_alloc(void)
 {
        int error;
+        pagedir_nosave = NULL;
+        nr_copy_pages = calc_nr(nr_copy_pages);
        pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
                 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
-        pagedir_nosave = NULL;
        if (!enough_free_mem())
                return -ENOMEM;
        if (!enough_swap())
                return -ENOSPC;
-        nr_copy_pages = calc_nr(nr_copy_pages);
        if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
                printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
                return -ENOMEM;
@@ -827,8 +827,8 @@ static int suspend_prepare_image(void)
        error = swsusp_alloc();
        if (error)
                return error;
-        
-        /* During allocating of suspend pagedir, new cold pages may appear. 
+        /* During allocating of suspend pagedir, new cold pages may appear.
         * Kill them.
         */
        drain_local_pages();
@@ -929,21 +929,6 @@ int swsusp_resume(void)
        return error;
 }
-/* More restore stuff */
-/*
- * Returns true if given address/order collides with any orig_address 
- */
-static int does_collide_order(unsigned long addr, int order)
-{
-        int i;
-        
-        for (i=0; i < (1<<order); i++)
-                if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
-                        return 1;
-        return 0;
-}
 /**
 *      On resume, for storing the PBE list and the image,
 *      we can only use memory pages that do not conflict with the pages
@@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
        unsigned long m;
        m = get_zeroed_page(gfp_mask);
-        while (does_collide_order(m, 0)) {
+        while (!PageNosaveFree(virt_to_page(m))) {
                eat_page((void *)m);
                m = get_zeroed_page(gfp_mask);
                if (!m)
@@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
        /* Set page flags */
-        for_each_zone(zone) {
+        for_each_zone (zone) {
                for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
                        SetPageNosaveFree(pfn_to_page(zone_pfn +
                                        zone->zone_start_pfn));
@@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
        /* Relocate colliding pages */
        for_each_pb_page (pbpage, pblist) {
-                if (does_collide_order((unsigned long)pbpage, 0)) {
+                if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
                        m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
                        if (!m) {
                                error = -ENOMEM;
@@ -1193,8 +1178,10 @@ static const char * sanity_check(void)
                return "version";
        if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
                return "machine";
+#if 0
        if(swsusp_info.cpus != num_online_cpus())
                return "number of cpus";
+#endif
        return NULL;
 }
diff --git a/kernel/printk.c b/kernel/printk.c
index 3a442bfb8bee..5092397fac29 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
                        log_level_unknown = 1;
        }
-        if (!cpu_online(smp_processor_id()) &&
+        if (!cpu_online(smp_processor_id())) {
-            system_state != SYSTEM_RUNNING) {
                /*
                 * Some console drivers may assume that per-cpu resources have
                 * been allocated.  So don't allow them to be called by this
diff --git a/kernel/resource.c b/kernel/resource.c
index 52f696f11adf..26967e042201 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new,
                        new->start = min;
                if (new->end > max)
                        new->end = max;
-                new->start = (new->start + align - 1) & ~(align - 1);
+                new->start = ALIGN(new->start, align);
                if (alignf)
                        alignf(alignf_data, new, size, align);
                if (new->start < new->end && new->end - new->start >= size - 1) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 76080d142e3d..a07cff90d849 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
 #define SCALE_PRIO(x, prio) \
        max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
 {
        if (p->static_prio < NICE_TO_PRIO(0))
                return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -206,7 +206,7 @@ struct runqueue {
         */
        unsigned long nr_running;
 #ifdef CONFIG_SMP
-        unsigned long cpu_load;
+        unsigned long cpu_load[3];
 #endif
        unsigned long long nr_switches;
@@ -260,22 +260,86 @@ struct runqueue {
 static DEFINE_PER_CPU(struct runqueue, runqueues);
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
 #define for_each_domain(cpu, domain) \
-        for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
 #define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
 #define this_rq()               (&__get_cpu_var(runqueues))
 #define task_rq(p)              cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
-/*
- * Default context-switch locking:
- */
 #ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next)  do { } while (0)
+# define prepare_arch_switch(next)      do { } while (0)
-# define finish_arch_switch(rq, next)   spin_unlock_irq(&(rq)->lock)
+#endif
-# define task_running(rq, p)            ((rq)->curr == (p))
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)       do { } while (0)
+#endif
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+        return rq->curr == p;
+}
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+        spin_unlock_irq(&rq->lock);
+}
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+        return p->oncpu;
+#else
+        return rq->curr == p;
+#endif
+}
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+        /*
+         * We can optimise this out completely for !SMP, because the
+         * SMP rebalancing from interrupt is the only thing that cares
+         * here.
+         */
+        next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+        spin_unlock_irq(&rq->lock);
+#else
+        spin_unlock(&rq->lock);
+#endif
+}
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+        /*
+         * After ->oncpu is cleared, the task can be moved to a different CPU.
+         * We must ensure this doesn't happen until the switch is completely
+         * finished.
+         */
+        smp_wmb();
+        prev->oncpu = 0;
 #endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+        local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 /*
 * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
 * bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
-#define SCHEDSTAT_VERSION 11
+#define SCHEDSTAT_VERSION 12
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 #ifdef CONFIG_SMP
                /* domain-specific stats */
+                preempt_disable();
                for_each_domain(cpu, sd) {
                        enum idle_type itype;
                        char mask_str[NR_CPUS];
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
                                    sd->lb_nobusyq[itype],
                                    sd->lb_nobusyg[itype]);
                        }
-                        seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+                        seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
                            sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
-                            sd->sbe_pushed, sd->sbe_attempts,
+                            sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+                            sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
                            sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
                }
+                preempt_enable();
 #endif
        }
        return 0;
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)
        return rq;
 }
-#ifdef CONFIG_SCHED_SMT
-static int cpu_and_siblings_are_idle(int cpu)
-{
-        int sib;
-        for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
-                if (idle_cpu(sib))
-                        continue;
-                return 0;
-        }
-        return 1;
-}
-#else
-#define cpu_and_siblings_are_idle(A) idle_cpu(A)
-#endif
 #ifdef CONFIG_SCHEDSTATS
 /*
 * Called when a process is dequeued from the active array and given
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
        rq->nr_running++;
 }
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
 {
        /* Caller must always ensure 'now >= p->timestamp' */
        unsigned long long __sleep_time = now - p->timestamp;
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
                }
        }
-        p->prio = effective_prio(p);
+        return effective_prio(p);
 }
 /*
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
        }
 #endif
-        recalc_task_prio(p, now);
+        p->prio = recalc_task_prio(p, now);
        /*
         * This checks to make sure it's not an uninterruptible task
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)
 }
 #ifdef CONFIG_SMP
-enum request_type {
-        REQ_MOVE_TASK,
-        REQ_SET_DOMAIN,
-};
 typedef struct {
        struct list_head list;
-        enum request_type type;
-        /* For REQ_MOVE_TASK */
        task_t *task;
        int dest_cpu;
-        /* For REQ_SET_DOMAIN */
-        struct sched_domain *sd;
        struct completion done;
 } migration_req_t;
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
        }
        init_completion(&req->done);
-        req->type = REQ_MOVE_TASK;
        req->task = p;
        req->dest_cpu = dest_cpu;
        list_add(&req->list, &rq->migration_queue);
@@ -886,26 +926,154 @@ void kick_process(task_t *p)
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+        if (type == 0)
+                return load_now;
-        return min(rq->cpu_load, load_now);
+        return min(rq->cpu_load[type-1], load_now);
 }
 /*
 * Return a high guess at the load of a migration-target cpu
 */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+        if (type == 0)
+                return load_now;
-        return max(rq->cpu_load, load_now);
+        return max(rq->cpu_load[type-1], load_now);
 }
-#endif
+/*
+ * find_idlest_group returns the least busy non-local CPU group in the domain,
+ * or NULL if there is none or the local group is not sufficiently busier.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+        struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+        unsigned long min_load = ULONG_MAX, this_load = 0;
+        int load_idx = sd->forkexec_idx;
+        int imbalance = 100 + (sd->imbalance_pct-100)/2;
+        do {
+                unsigned long load, avg_load;
+                int local_group;
+                int i;
+                local_group = cpu_isset(this_cpu, group->cpumask);
+                /* XXX: put a cpus allowed check */
+                /* Tally up the load of all CPUs in the group */
+                avg_load = 0;
+                for_each_cpu_mask(i, group->cpumask) {
+                        /* Bias balancing toward cpus of our domain */
+                        if (local_group)
+                                load = source_load(i, load_idx);
+                        else
+                                load = target_load(i, load_idx);
+                        avg_load += load;
+                }
+                /* Adjust by relative CPU power of the group */
+                avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+                if (local_group) {
+                        this_load = avg_load;
+                        this = group;
+                } else if (avg_load < min_load) {
+                        min_load = avg_load;
+                        idlest = group;
+                }
+                group = group->next;
+        } while (group != sd->groups);
+        if (!idlest || 100*this_load < imbalance*min_load)
+                return NULL;
+        return idlest;
+}
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+{
+        unsigned long load, min_load = ULONG_MAX;
+        int idlest = -1;
+        int i;
+        for_each_cpu_mask(i, group->cpumask) {
+                load = source_load(i, 0);
+                if (load < min_load || (load == min_load && i == this_cpu)) {
+                        min_load = load;
+                        idlest = i;
+                }
+        }
+        return idlest;
+}
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+        struct task_struct *t = current;
+        struct sched_domain *tmp, *sd = NULL;
+        for_each_domain(cpu, tmp)
+                if (tmp->flags & flag)
+                        sd = tmp;
+        while (sd) {
+                cpumask_t span;
+                struct sched_group *group;
+                int new_cpu;
+                int weight;
+                span = sd->span;
+                group = find_idlest_group(sd, t, cpu);
+                if (!group)
+                        goto nextlevel;
+                new_cpu = find_idlest_cpu(group, cpu);
+                if (new_cpu == -1 || new_cpu == cpu)
+                        goto nextlevel;
+                /* Now try balancing at a lower domain level */
+                cpu = new_cpu;
+nextlevel:
+                sd = NULL;
+                weight = cpus_weight(span);
+                for_each_domain(cpu, tmp) {
+                        if (weight <= cpus_weight(tmp->span))
+                                break;
+                        if (tmp->flags & flag)
+                                sd = tmp;
+                }
+                /* while loop will break here if sd == NULL */
+        }
+        return cpu;
+}
+#endif /* CONFIG_SMP */
 /*
 * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)
        for_each_domain(cpu, sd) {
                if (sd->flags & SD_WAKE_IDLE) {
-                        cpus_and(tmp, sd->span, cpu_online_map);
+                        cpus_and(tmp, sd->span, p->cpus_allowed);
-                        cpus_and(tmp, tmp, p->cpus_allowed);
                        for_each_cpu_mask(i, tmp) {
                                if (idle_cpu(i))
                                        return i;
                        }
                }
-                else break;
+                else
+                        break;
        }
        return cpu;
 }
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
        runqueue_t *rq;
 #ifdef CONFIG_SMP
        unsigned long load, this_load;
-        struct sched_domain *sd;
+        struct sched_domain *sd, *this_sd = NULL;
        int new_cpu;
 #endif
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
        if (unlikely(task_running(rq, p)))
                goto out_activate;
-#ifdef CONFIG_SCHEDSTATS
+        new_cpu = cpu;
        schedstat_inc(rq, ttwu_cnt);
        if (cpu == this_cpu) {
                schedstat_inc(rq, ttwu_local);
-        } else {
+                goto out_set_cpu;
-                for_each_domain(this_cpu, sd) {
+        }
-                        if (cpu_isset(cpu, sd->span)) {
-                                schedstat_inc(sd, ttwu_wake_remote);
+        for_each_domain(this_cpu, sd) {
-                                break;
+                if (cpu_isset(cpu, sd->span)) {
-                        }
+                        schedstat_inc(sd, ttwu_wake_remote);
+                        this_sd = sd;
+                        break;
                }
        }
-#endif
-        new_cpu = cpu;
+        if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-        if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;
-        load = source_load(cpu);
-        this_load = target_load(this_cpu);
        /*
-         * If sync wakeup then subtract the (maximum possible) effect of
+         * Check for affine wakeup and passive balancing possibilities.
-         * the currently running task from the load of the current CPU:
         */
-        if (sync)
+        if (this_sd) {
-                this_load -= SCHED_LOAD_SCALE;
+                int idx = this_sd->wake_idx;
+                unsigned int imbalance;
-        /* Don't pull the task off an idle CPU to a busy one */
+                imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-        if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-                goto out_set_cpu;
-        new_cpu = this_cpu; /* Wake to this CPU if we can */
+                load = source_load(cpu, idx);
+                this_load = target_load(this_cpu, idx);
-        /*
+                new_cpu = this_cpu; /* Wake to this CPU if we can */
-         * Scan domains for affine wakeup and passive balancing
-         * possibilities.
-         */
-        for_each_domain(this_cpu, sd) {
-                unsigned int imbalance;
-                /*
-                 * Start passive balancing when half the imbalance_pct
-                 * limit is reached.
-                 */
-                imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
-                if ((sd->flags & SD_WAKE_AFFINE) &&
+                if (this_sd->flags & SD_WAKE_AFFINE) {
-                                !task_hot(p, rq->timestamp_last_tick, sd)) {
+                        unsigned long tl = this_load;
                        /*
-                         * This domain has SD_WAKE_AFFINE and p is cache cold
+                         * If sync wakeup then subtract the (maximum possible)
-                         * in this domain.
+                         * effect of the currently running task from the load
+                         * of the current CPU:
                         */
-                        if (cpu_isset(cpu, sd->span)) {
+                        if (sync)
-                                schedstat_inc(sd, ttwu_move_affine);
+                                tl -= SCHED_LOAD_SCALE;
+                        if ((tl <= load &&
+                                tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+                                100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+                                /*
+                                 * This domain has SD_WAKE_AFFINE and
+                                 * the load comparison above found
+                                 * no bad imbalance.
+                                 */
+                                schedstat_inc(this_sd, ttwu_move_affine);
                                goto out_set_cpu;
                        }
-                } else if ((sd->flags & SD_WAKE_BALANCE) &&
+                }
-                                imbalance*this_load <= 100*load) {
-                        /*
+                /*
-                         * This domain has SD_WAKE_BALANCE and there is
+                 * Start passive balancing when half the imbalance_pct
-                         * an imbalance.
+                 * limit is reached.
-                         */
+                 */
-                        if (cpu_isset(cpu, sd->span)) {
+                if (this_sd->flags & SD_WAKE_BALANCE) {
-                                schedstat_inc(sd, ttwu_move_balance);
+                        if (imbalance*this_load <= 100*load) {
+                                schedstat_inc(this_sd, ttwu_move_balance);
                                goto out_set_cpu;
                        }
                }
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
        return try_to_wake_up(p, state, 0);
 }
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-                           struct sched_domain *sd);
-#endif
 /*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
 {
+        int cpu = get_cpu();
+#ifdef CONFIG_SMP
+        cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+        set_task_cpu(p, cpu);
        /*
         * We mark the process as running here, but have not actually
         * inserted it onto the runqueue yet. This guarantees that
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
        p->state = TASK_RUNNING;
        INIT_LIST_HEAD(&p->run_list);
        p->array = NULL;
-        spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
        memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+        p->oncpu = 0;
+#endif
 #ifdef CONFIG_PREEMPT
-        /*
+        /* Want to start with kernel preemption disabled. */
-         * During context-switch we hold precisely one spinlock, which
-         * schedule_tail drops. (in the common case it's this_rq()->lock,
-         * but it also can be p->switch_lock.) So we compensate with a count
-         * of 1. Also, we want to start with kernel preemption disabled.
-         */
        p->thread_info->preempt_count = 1;
 #endif
        /*
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
                 * runqueue lock is not a problem.
                 */
                current->time_slice = 1;
-                preempt_disable();
                scheduler_tick();
-                local_irq_enable();
+        }
-                preempt_enable();
+        local_irq_enable();
-        } else
+        put_cpu();
-                local_irq_enable();
 }
 /*
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
        runqueue_t *rq, *this_rq;
        rq = task_rq_lock(p, &flags);
-        cpu = task_cpu(p);
-        this_cpu = smp_processor_id();
        BUG_ON(p->state != TASK_RUNNING);
+        this_cpu = smp_processor_id();
+        cpu = task_cpu(p);
        /*
         * We decrease the sleep average of forking parents
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)
 }
 /**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+        prepare_lock_switch(rq, next);  /* set up locking for the switch */
+        prepare_arch_switch(next);      /* architecture specific hook */
+}
+/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
- * We enter this with the runqueue still locked, and finish_arch_switch()
+ * finish_task_switch must be called after the context switch, paired
- * will unlock it along with doing any other architecture-specific cleanup
+ * with a prepare_task_switch call before the context switch.
- * actions.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock.  (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
        __releases(rq->lock)
 {
-        runqueue_t *rq = this_rq();
        struct mm_struct *mm = rq->prev_mm;
        unsigned long prev_task_flags;
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
         *              Manfred Spraul <manfred@colorfullife.com>
         */
        prev_task_flags = prev->flags;
-        finish_arch_switch(rq, prev);
+        finish_arch_switch(prev);
+        finish_lock_switch(rq, prev);
        if (mm)
                mmdrop(mm);
        if (unlikely(prev_task_flags & PF_DEAD))
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
 asmlinkage void schedule_tail(task_t *prev)
        __releases(rq->lock)
 {
-        finish_task_switch(prev);
+        runqueue_t *rq = this_rq();
+        finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+        /* In this case, finish_task_switch does not reenable preemption */
+        preempt_enable();
+#endif
        if (current->set_child_tid)
                put_user(current->pid, current->set_child_tid);
 }
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 }
 /*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-                           struct sched_domain *sd)
-{
-        unsigned long load, min_load, this_load;
-        int i, min_cpu;
-        cpumask_t mask;
-        min_cpu = UINT_MAX;
-        min_load = ULONG_MAX;
-        cpus_and(mask, sd->span, p->cpus_allowed);
-        for_each_cpu_mask(i, mask) {
-                load = target_load(i);
-                if (load < min_load) {
-                        min_cpu = i;
-                        min_load = load;
-                        /* break out early on an idle CPU: */
-                        if (!min_load)
-                                break;
-                }
-        }
-        /* add +1 to account for the new task */
-        this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
-        /*
-         * Would with the addition of the new task to the
-         * current CPU there be an imbalance between this
-         * CPU and the idlest CPU?
-         *
-         * Use half of the balancing threshold - new-context is
-         * a good opportunity to balance.
-         */
-        if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
-                return min_cpu;
-        return this_cpu;
-}
-/*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * This is accomplished by forcing the cpu_allowed mask to only
 * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
@@ -1571,37 +1712,16 @@ out:
 }
 /*
- * sched_exec(): find the highest-level, exec-balance-capable
+ * sched_exec - execve() is a valuable balancing opportunity, because at
- * domain and try to migrate the task to the least loaded CPU.
+ * this point the task has the smallest effective memory and cache footprint.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
 */
 void sched_exec(void)
 {
-        struct sched_domain *tmp, *sd = NULL;
        int new_cpu, this_cpu = get_cpu();
+        new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
-        /* Prefer the current CPU if there's only this task running */
-        if (this_rq()->nr_running <= 1)
-                goto out;
-        for_each_domain(this_cpu, tmp)
-                if (tmp->flags & SD_BALANCE_EXEC)
-                        sd = tmp;
-        if (sd) {
-                schedstat_inc(sd, sbe_attempts);
-                new_cpu = find_idlest_cpu(current, this_cpu, sd);
-                if (new_cpu != this_cpu) {
-                        schedstat_inc(sd, sbe_pushed);
-                        put_cpu();
-                        sched_migrate_task(current, new_cpu);
-                        return;
-                }
-        }
-out:
        put_cpu();
+        if (new_cpu != this_cpu)
+                sched_migrate_task(current, new_cpu);
 }
 /*
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-                     struct sched_domain *sd, enum idle_type idle)
+             struct sched_domain *sd, enum idle_type idle, int *all_pinned)
 {
        /*
         * We do not migrate tasks that are:
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-        if (task_running(rq, p))
-                return 0;
        if (!cpu_isset(this_cpu, p->cpus_allowed))
                return 0;
+        *all_pinned = 0;
+        if (task_running(rq, p))
+                return 0;
        /*
         * Aggressive migration if:
-         * 1) the [whole] cpu is idle, or
+         * 1) task is cache cold, or
         * 2) too many balance attempts have failed.
         */
-        if (cpu_and_siblings_are_idle(this_cpu) || \
+        if (sd->nr_balance_failed > sd->cache_nice_tries)
-                        sd->nr_balance_failed > sd->cache_nice_tries)
                return 1;
        if (task_hot(p, rq->timestamp_last_tick, sd))
-                        return 0;
+                return 0;
        return 1;
 }
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
                      unsigned long max_nr_move, struct sched_domain *sd,
-                      enum idle_type idle)
+                      enum idle_type idle, int *all_pinned)
 {
        prio_array_t *array, *dst_array;
        struct list_head *head, *curr;
-        int idx, pulled = 0;
+        int idx, pulled = 0, pinned = 0;
        task_t *tmp;
-        if (max_nr_move <= 0 || busiest->nr_running <= 1)
+        if (max_nr_move == 0)
                goto out;
+        pinned = 1;
        /*
         * We first consider expired tasks. Those will likely not be
         * executed in the near future, and they are most likely to
@@ -1717,7 +1840,7 @@ skip_queue:
        curr = curr->prev;
-        if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+        if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
                if (curr != head)
                        goto skip_queue;
                idx++;
@@ -1746,6 +1869,9 @@ out:
         * inside pull_task().
         */
        schedstat_add(sd, lb_gained[idle], pulled);
+        if (all_pinned)
+                *all_pinned = pinned;
        return pulled;
 }
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+        int load_idx;
        max_load = this_load = total_load = total_pwr = 0;
+        if (idle == NOT_IDLE)
+                load_idx = sd->busy_idx;
+        else if (idle == NEWLY_IDLE)
+                load_idx = sd->newidle_idx;
+        else
+                load_idx = sd->idle_idx;
        do {
                unsigned long load;
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                for_each_cpu_mask(i, group->cpumask) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
-                                load = target_load(i);
+                                load = target_load(i, load_idx);
                        else
-                                load = source_load(i);
+                                load = source_load(i, load_idx);
                        avg_load += load;
                }
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                if (local_group) {
                        this_load = avg_load;
                        this = group;
-                        goto nextgroup;
                } else if (avg_load > max_load) {
                        max_load = avg_load;
                        busiest = group;
                }
-nextgroup:
                group = group->next;
        } while (group != sd->groups);
@@ -1870,15 +2001,9 @@ nextgroup:
        /* Get rid of the scaling factor, rounding down as we divide */
        *imbalance = *imbalance / SCHED_LOAD_SCALE;
        return busiest;
 out_balanced:
-        if (busiest && (idle == NEWLY_IDLE ||
-                        (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
-                *imbalance = 1;
-                return busiest;
-        }
        *imbalance = 0;
        return NULL;
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
        int i;
        for_each_cpu_mask(i, group->cpumask) {
-                load = source_load(i);
+                load = source_load(i, 0);
                if (load > max_load) {
                        max_load = load;
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 }
 /*
+ * Max backoff if we encounter pinned tasks. The value is fairly arbitrary;
+ * it only needs to be large enough.
+ */
+#define MAX_PINNED_INTERVAL     512
+/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
        struct sched_group *group;
        runqueue_t *busiest;
        unsigned long imbalance;
-        int nr_moved;
+        int nr_moved, all_pinned = 0;
+        int active_balance = 0;
        spin_lock(&this_rq->lock);
        schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                goto out_balanced;
        }
-        /*
+        BUG_ON(busiest == this_rq);
-         * This should be "impossible", but since load
-         * balancing is inherently racy and statistical,
-         * it could happen in theory.
-         */
-        if (unlikely(busiest == this_rq)) {
-                WARN_ON(1);
-                goto out_balanced;
-        }
        schedstat_add(sd, lb_imbalance[idle], imbalance);
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 */
                double_lock_balance(this_rq, busiest);
                nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                                imbalance, sd, idle);
+                                                imbalance, sd, idle,
+                                                &all_pinned);
                spin_unlock(&busiest->lock);
+                /* All tasks on this runqueue were pinned by CPU affinity */
+                if (unlikely(all_pinned))
+                        goto out_balanced;
        }
        spin_unlock(&this_rq->lock);
        if (!nr_moved) {
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                sd->nr_balance_failed++;
                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-                        int wake = 0;
                        spin_lock(&busiest->lock);
                        if (!busiest->active_balance) {
                                busiest->active_balance = 1;
                                busiest->push_cpu = this_cpu;
-                                wake = 1;
+                                active_balance = 1;
                        }
                        spin_unlock(&busiest->lock);
-                        if (wake)
+                        if (active_balance)
                                wake_up_process(busiest->migration_thread);
                        /*
                         * We've kicked active balancing, reset the failure
                         * counter.
                         */
-                        sd->nr_balance_failed = sd->cache_nice_tries;
+                        sd->nr_balance_failed = sd->cache_nice_tries+1;
                }
+        } else
-                /*
-                 * We were unbalanced, but unsuccessful in move_tasks(),
-                 * so bump the balance_interval to lessen the lock contention.
-                 */
-                if (sd->balance_interval < sd->max_interval)
-                        sd->balance_interval++;
-        } else {
                sd->nr_balance_failed = 0;
+        if (likely(!active_balance)) {
                /* We were unbalanced, so reset the balancing interval */
                sd->balance_interval = sd->min_interval;
+        } else {
+                /*
+                 * If we've begun active balancing, start to back off. This
+                 * case may not be covered by the all_pinned logic if there
+                 * is only 1 task on the busy runqueue (because we don't call
+                 * move_tasks).
+                 */
+                if (sd->balance_interval < sd->max_interval)
+                        sd->balance_interval *= 2;
        }
        return nr_moved;
@@ -2005,8 +2137,10 @@ out_balanced:
        schedstat_inc(sd, lb_balanced[idle]);
+        sd->nr_balance_failed = 0;
        /* tune up the balancing interval */
-        if (sd->balance_interval < sd->max_interval)
+        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+                        (sd->balance_interval < sd->max_interval))
                sd->balance_interval *= 2;
        return 0;
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
        if (!group) {
-                schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
-                goto out;
+                goto out_balanced;
        }
        busiest = find_busiest_queue(group);
-        if (!busiest || busiest == this_rq) {
+        if (!busiest) {
-                schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
-                goto out;
+                goto out_balanced;
        }
+        BUG_ON(busiest == this_rq);
        /* Attempt to move tasks */
        double_lock_balance(this_rq, busiest);
        schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
        nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                        imbalance, sd, NEWLY_IDLE);
+                                        imbalance, sd, NEWLY_IDLE, NULL);
        if (!nr_moved)
                schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+        else
+                sd->nr_balance_failed = 0;
        spin_unlock(&busiest->lock);
-out:
        return nr_moved;
+out_balanced:
+        schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+        sd->nr_balance_failed = 0;
+        return 0;
 }
 /*
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 {
        struct sched_domain *sd;
-        struct sched_group *cpu_group;
        runqueue_t *target_rq;
-        cpumask_t visited_cpus;
+        int target_cpu = busiest_rq->push_cpu;
-        int cpu;
+        if (busiest_rq->nr_running <= 1)
+                /* no task to move */
+                return;
+        target_rq = cpu_rq(target_cpu);
        /*
-         * Search for suitable CPUs to push tasks to in successively higher
+         * This condition is "impossible", if it occurs
-         * domains with SD_LOAD_BALANCE set.
+         * we need to fix it.  Originally reported by
+         * Bjorn Helgaas on a 128-cpu setup.
         */
-        visited_cpus = CPU_MASK_NONE;
+        BUG_ON(busiest_rq == target_rq);
-        for_each_domain(busiest_cpu, sd) {
-                if (!(sd->flags & SD_LOAD_BALANCE))
-                        /* no more domains to search */
-                        break;
-                schedstat_inc(sd, alb_cnt);
+        /* move a task from busiest_rq to target_rq */
+        double_lock_balance(busiest_rq, target_rq);
-                cpu_group = sd->groups;
+        /* Search for an sd spanning us and the target CPU. */
-                do {
+        for_each_domain(target_cpu, sd)
-                        for_each_cpu_mask(cpu, cpu_group->cpumask) {
+                if ((sd->flags & SD_LOAD_BALANCE) &&
-                                if (busiest_rq->nr_running <= 1)
+                        cpu_isset(busiest_cpu, sd->span))
-                                        /* no more tasks left to move */
+                                break;
-                                        return;
-                                if (cpu_isset(cpu, visited_cpus))
+        if (unlikely(sd == NULL))
-                                        continue;
+                goto out;
-                                cpu_set(cpu, visited_cpus);
-                                if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
+        schedstat_inc(sd, alb_cnt);
-                                        continue;
+        if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
-                                target_rq = cpu_rq(cpu);
+                schedstat_inc(sd, alb_pushed);
-                                /*
+        else
-                                 * This condition is "impossible", if it occurs
+                schedstat_inc(sd, alb_failed);
-                                 * we need to fix it.  Originally reported by
+out:
-                                 * Bjorn Helgaas on a 128-cpu setup.
+        spin_unlock(&target_rq->lock);
-                                 */
-                                BUG_ON(busiest_rq == target_rq);
-                                /* move a task from busiest_rq to target_rq */
-                                double_lock_balance(busiest_rq, target_rq);
-                                if (move_tasks(target_rq, cpu, busiest_rq,
-                                                1, sd, SCHED_IDLE)) {
-                                        schedstat_inc(sd, alb_pushed);
-                                } else {
-                                        schedstat_inc(sd, alb_failed);
-                                }
-                                spin_unlock(&target_rq->lock);
-                        }
-                        cpu_group = cpu_group->next;
-                } while (cpu_group != sd->groups);
-        }
 }
 /*
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
        unsigned long old_load, this_load;
        unsigned long j = jiffies + CPU_OFFSET(this_cpu);
        struct sched_domain *sd;
+        int i;
-        /* Update our load */
-        old_load = this_rq->cpu_load;
        this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-        /*
+        /* Update our load */
-         * Round up the averaging division if load is increasing. This
+        for (i = 0; i < 3; i++) {
-         * prevents us from getting stuck on 9 if the load is 10, for
+                unsigned long new_load = this_load;
-         * example.
+                int scale = 1 << i;
-         */
+                old_load = this_rq->cpu_load[i];
-        if (this_load > old_load)
+                /*
-                old_load++;
+                 * Round up the averaging division if load is increasing. This
-        this_rq->cpu_load = (old_load + this_load) / 2;
+                 * prevents us from getting stuck on 9 if the load is 10, for
+                 * example.
+                 */
+                if (new_load > old_load)
+                        new_load += scale-1;
+                this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+        }
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
@@ -2447,11 +2577,15 @@ out:
 #ifdef CONFIG_SCHED_SMT
 static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
-        struct sched_domain *sd = this_rq->sd;
+        struct sched_domain *tmp, *sd = NULL;
        cpumask_t sibling_map;
        int i;
-        if (!(sd->flags & SD_SHARE_CPUPOWER))
+        for_each_domain(this_cpu, tmp)
+                if (tmp->flags & SD_SHARE_CPUPOWER)
+                        sd = tmp;
+        if (!sd)
                return;
        /*
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
-        struct sched_domain *sd = this_rq->sd;
+        struct sched_domain *tmp, *sd = NULL;
        cpumask_t sibling_map;
        prio_array_t *array;
        int ret = 0, i;
        task_t *p;
-        if (!(sd->flags & SD_SHARE_CPUPOWER))
+        for_each_domain(this_cpu, tmp)
+                if (tmp->flags & SD_SHARE_CPUPOWER)
+                        sd = tmp;
+        if (!sd)
                return 0;
        /*
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)
        struct list_head *queue;
        unsigned long long now;
        unsigned long run_time;
-        int cpu, idx;
+        int cpu, idx, new_prio;
        /*
         * Test if we are atomic.  Since do_exit() needs to call into
@@ -2735,9 +2873,14 @@ go_idle:
                        delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
                array = next->array;
-                dequeue_task(next, array);
+                new_prio = recalc_task_prio(next, next->timestamp + delta);
-                recalc_task_prio(next, next->timestamp + delta);
-                enqueue_task(next, array);
+                if (unlikely(next->prio != new_prio)) {
+                        dequeue_task(next, array);
+                        next->prio = new_prio;
+                        enqueue_task(next, array);
+                } else
+                        requeue_task(next, array);
        }
        next->activated = 0;
 switch_tasks:
@@ -2761,11 +2904,15 @@ switch_tasks:
                rq->curr = next;
                ++*switch_count;
-                prepare_arch_switch(rq, next);
+                prepare_task_switch(rq, next);
                prev = context_switch(rq, prev, next);
                barrier();
+                /*
-                finish_task_switch(prev);
+                 * this_rq must be evaluated again because prev may have moved
+                 * CPUs since it called schedule(), thus the 'rq' on its stack
+                 * frame will be invalid.
+                 */
+                finish_task_switch(this_rq(), prev);
        } else
                spin_unlock_irq(&rq->lock);
@@ -3384,13 +3531,24 @@ recheck:
        if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
                return -EINVAL;
-        if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
+        /*
-            param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
+         * Allow unprivileged RT tasks to decrease priority:
-            !capable(CAP_SYS_NICE))
+         */
-                return -EPERM;
+        if (!capable(CAP_SYS_NICE)) {
-        if ((current->euid != p->euid) && (current->euid != p->uid) &&
+                /* can't change policy */
-            !capable(CAP_SYS_NICE))
+                if (policy != p->policy)
-                return -EPERM;
+                        return -EPERM;
+                /* can't increase priority */
+                if (policy != SCHED_NORMAL &&
+                    param->sched_priority > p->rt_priority &&
+                    param->sched_priority >
+                                p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+                        return -EPERM;
+                /* can't change other user's priorities */
+                if ((current->euid != p->euid) &&
+                    (current->euid != p->uid))
+                        return -EPERM;
+        }
        retval = security_task_setscheduler(p, policy, param);
        if (retval)
@@ -4030,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu)
        spin_lock_irqsave(&rq->lock, flags);
        rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+        idle->oncpu = 1;
+#endif
        set_tsk_need_resched(idle);
        spin_unlock_irqrestore(&rq->lock, flags);
@@ -4174,8 +4335,7 @@ static int migration_thread(void * data)
                struct list_head *head;
                migration_req_t *req;
-                if (current->flags & PF_FREEZE)
+                try_to_freeze();
-                        refrigerator(PF_FREEZE);
                spin_lock_irq(&rq->lock);
@@ -4200,17 +4360,9 @@ static int migration_thread(void * data)
                req = list_entry(head->next, migration_req_t, list);
                list_del_init(head->next);
-                if (req->type == REQ_MOVE_TASK) {
+                spin_unlock(&rq->lock);
-                        spin_unlock(&rq->lock);
+                __migrate_task(req->task, cpu, req->dest_cpu);
-                        __migrate_task(req->task, cpu, req->dest_cpu);
+                local_irq_enable();
-                        local_irq_enable();
-                } else if (req->type == REQ_SET_DOMAIN) {
-                        rq->sd = req->sd;
-                        spin_unlock_irq(&rq->lock);
-                } else {
-                        spin_unlock_irq(&rq->lock);
-                        WARN_ON(1);
-                }
                complete(&req->done);
        }
@@ -4441,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
                        migration_req_t *req;
                        req = list_entry(rq->migration_queue.next,
                                         migration_req_t, list);
-                        BUG_ON(req->type != REQ_MOVE_TASK);
                        list_del_init(&req->list);
                        complete(&req->done);
                }
@@ -4472,12 +4623,17 @@ int __init migration_init(void)
 #endif
 #ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
        int level = 0;
+        if (!sd) {
+                printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+                return;
+        }
        printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
        do {
@@ -4560,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 #define sched_domain_debug(sd, cpu) {}
 #endif
+static int sd_degenerate(struct sched_domain *sd)
+{       /* Returns 1 if sd does not contribute to scheduling, else 0. */
+        if (cpus_weight(sd->span) == 1) /* presumably: span holds one CPU */
+                return 1;
+        /* Following flags need at least 2 groups */
+        if (sd->flags & (SD_LOAD_BALANCE |
+                         SD_BALANCE_NEWIDLE |
+                         SD_BALANCE_FORK |
+                         SD_BALANCE_EXEC)) {
+                if (sd->groups != sd->groups->next) /* more than 1 group */
+                        return 0;
+        }
+        /* Following flags don't use groups */
+        if (sd->flags & (SD_WAKE_IDLE |
+                         SD_WAKE_AFFINE |
+                         SD_WAKE_BALANCE))
+                return 0;       /* any wake flag keeps the domain */
+        return 1;               /* no usable balance flag, no wake flag */
+}
+static int sd_parent_degenerate(struct sched_domain *sd,
+                                                struct sched_domain *parent)
+{       /* Returns 1 if parent adds nothing on top of sd, else 0. */
+        unsigned long cflags = sd->flags, pflags = parent->flags;
+        if (sd_degenerate(parent))      /* parent is useless by itself */
+                return 1;
+        if (!cpus_equal(sd->span, parent->span)) /* spans differ: keep it */
+                return 0;
+        /* Does parent contain flags not in child? */
+        /* WAKE_BALANCE is a subset of WAKE_AFFINE */
+        if (cflags & SD_WAKE_AFFINE)
+                pflags &= ~SD_WAKE_BALANCE;
+        /* Flags needing groups don't count if only 1 group in parent */
+        if (parent->groups == parent->groups->next) {
+                pflags &= ~(SD_LOAD_BALANCE |
+                                SD_BALANCE_NEWIDLE |
+                                SD_BALANCE_FORK |
+                                SD_BALANCE_EXEC);
+        }
+        if (~cflags & pflags)   /* parent has a flag the child lacks */
+                return 0;
+        return 1;               /* equal span and no extra parent flags */
+}
 /*
 * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
 * hold the hotplug lock.
 */
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
-        migration_req_t req;
-        unsigned long flags;
        runqueue_t *rq = cpu_rq(cpu);
-        int local = 1;
+        struct sched_domain *tmp;
-        sched_domain_debug(sd, cpu);
-        spin_lock_irqsave(&rq->lock, flags);
+        /* Remove the sched domains which do not contribute to scheduling. */
+        for (tmp = sd; tmp; tmp = tmp->parent) {
-        if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+                struct sched_domain *parent = tmp->parent;
-                rq->sd = sd;
+                if (!parent)
-        } else {
+                        break;
-                init_completion(&req.done);
+                if (sd_parent_degenerate(tmp, parent))
-                req.type = REQ_SET_DOMAIN;
+                        tmp->parent = parent->parent;
-                req.sd = sd;
-                list_add(&req.list, &rq->migration_queue);
-                local = 0;
        }
-        spin_unlock_irqrestore(&rq->lock, flags);
+        if (sd && sd_degenerate(sd))
+                sd = sd->parent;
-        if (!local) {
+        sched_domain_debug(sd, cpu);
-                wake_up_process(rq->migration_thread);
-                wait_for_completion(&req.done);
+        rcu_assign_pointer(rq->sd, sd);
-        }
 }
 /* cpus with isolated domains */
@@ -4622,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_power to 0.
 */
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
                        cpumask_t span, int (*group_fn)(int cpu))
 {
        struct sched_group *first = NULL, *last = NULL;
@@ -4658,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
 #ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
 #else
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
 {
        return cpu;
 }
@@ -4672,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
 {
 #ifdef CONFIG_SCHED_SMT
        return first_cpu(cpu_sibling_map[cpu]);
@@ -4685,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu)
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
 {
        return cpu_to_node(cpu);
 }
@@ -4716,39 +4917,28 @@ static void check_sibling_maps(void)
 #endif
 /*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
 */
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
 {
        int i;
-        cpumask_t cpu_default_map;
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-        check_sibling_maps();
-#endif
-        /*
-         * Setup mask for cpus without special case scheduling requirements.
-         * For now this just excludes isolated cpus, but could be used to
-         * exclude other special cases in the future.
-         */
-        cpus_complement(cpu_default_map, cpu_isolated_map);
-        cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
        /*
-         * Set up domains. Isolated domains just stay on the dummy domain.
+         * Set up domains for cpus specified by the cpu_map.
         */
-        for_each_cpu_mask(i, cpu_default_map) {
+        for_each_cpu_mask(i, *cpu_map) {
                int group;
                struct sched_domain *sd = NULL, *p;
                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
-                cpus_and(nodemask, nodemask, cpu_default_map);
+                cpus_and(nodemask, nodemask, *cpu_map);
 #ifdef CONFIG_NUMA
                sd = &per_cpu(node_domains, i);
                group = cpu_to_node_group(i);
                *sd = SD_NODE_INIT;
-                sd->span = cpu_default_map;
+                sd->span = *cpu_map;
                sd->groups = &sched_group_nodes[group];
 #endif
@@ -4766,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void)
                group = cpu_to_cpu_group(i);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
-                cpus_and(sd->span, sd->span, cpu_default_map);
+                cpus_and(sd->span, sd->span, *cpu_map);
                sd->parent = p;
                sd->groups = &sched_group_cpus[group];
 #endif
@@ -4776,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void)
        /* Set up CPU (sibling) groups */
        for_each_online_cpu(i) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
-                cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+                cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                if (i != first_cpu(this_sibling_map))
                        continue;
@@ -4789,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void)
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);
-                cpus_and(nodemask, nodemask, cpu_default_map);
+                cpus_and(nodemask, nodemask, *cpu_map);
                if (cpus_empty(nodemask))
                        continue;
@@ -4799,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void)
 #ifdef CONFIG_NUMA
        /* Set up node groups */
-        init_sched_build_groups(sched_group_nodes, cpu_default_map,
+        init_sched_build_groups(sched_group_nodes, *cpu_map,
                                        &cpu_to_node_group);
 #endif
        /* Calculate CPU power for physical packages and nodes */
-        for_each_cpu_mask(i, cpu_default_map) {
+        for_each_cpu_mask(i, *cpu_map) {
                int power;
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
@@ -4828,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void)
        }
        /* Attach the domains */
-        for_each_online_cpu(i) {
+        for_each_cpu_mask(i, *cpu_map) {
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
                sd = &per_cpu(cpu_domains, i);
@@ -4838,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void)
                cpu_attach_domain(sd, i);
        }
 }
+/*
+ * Set up scheduler domains and groups for the cpus in cpu_map, leaving out
+ * every cpu that is set in cpu_isolated_map.  cpu_map is only read; the
+ * filtered set is built in a local mask and handed to build_sched_domains().
+ * Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
+{
+        cpumask_t cpu_default_map;
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+        check_sibling_maps();
+#endif
+        /*
+         * Setup mask for cpus without special case scheduling requirements.
+         * For now this just excludes isolated cpus, but could be used to
+         * exclude other special cases in the future.
+         */
+        cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+        build_sched_domains(&cpu_default_map);
+}
-#ifdef CONFIG_HOTPLUG_CPU
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
-static void __devinit arch_destroy_sched_domains(void)
 {
        /* Do nothing: everything is statically allocated. */
 }
-#endif
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 /*
- * Initial dummy domain for early boot and for hotplug cpu. Being static,
+ * Detach sched domains from the group of cpus specified in cpu_map.
- * it is initialized to zero, so all balancing flags are cleared which is
+ * These cpus will now be attached to the NULL domain.
- * what we want.
 */
-static struct sched_domain sched_domain_dummy;
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+        int cpu;
+        /* First point every cpu in the mask at the NULL domain. */
+        for_each_cpu_mask(cpu, *cpu_map) {
+                cpu_attach_domain(NULL, cpu);
+        }
+        /*
+         * Wait before handing the old domains to the arch teardown hook;
+         * presumably this lets code still walking the previous rq->sd
+         * finish first -- confirm against the readers of rq->sd.
+         */
+        synchronize_sched();
+        arch_destroy_sched_domains(cpu_map);
+}
+/*
+ * Partition sched domains as specified by the two cpumasks.
+ *
+ * Both masks are first restricted to the online cpus.  This is done in
+ * place, so the caller's masks are modified.  Every cpu found in either
+ * mask is then attached to the NULL domain (which includes waiting for an
+ * RCU quiescent period), and finally sched domains are rebuilt separately
+ * for each partition that is not empty.
+ *
+ * build_sched_domains() is called directly here, so the isolated-cpu
+ * filtering performed by arch_init_sched_domains() is not applied.
+ *
+ * Call with hotplug lock held.
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+        cpumask_t affected;
+        /* Offline cpus never take part in a partition. */
+        cpus_and(*partition1, *partition1, cpu_online_map);
+        cpus_and(*partition2, *partition2, cpu_online_map);
+        /* Detach sched domains from all of the affected cpus. */
+        cpus_or(affected, *partition1, *partition2);
+        detach_destroy_domains(&affected);
+        /* Rebuild each side independently, skipping empty partitions. */
+        if (!cpus_empty(*partition1)) {
+                build_sched_domains(partition1);
+        }
+        if (!cpus_empty(*partition2)) {
+                build_sched_domains(partition2);
+        }
+}
 #ifdef CONFIG_HOTPLUG_CPU
 /*
 * Force a reinitialization of the sched domains hierarchy.  The domains
 * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to a "dummy" domain
+ * code, so we temporarily attach all running cpus to the NULL domain
 * which will prevent rebalancing while the sched domains are recalculated.
 */
 static int update_sched_domains(struct notifier_block *nfb,
                                unsigned long action, void *hcpu)
 {
-        int i;
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_DOWN_PREPARE:
-                for_each_online_cpu(i)
+                detach_destroy_domains(&cpu_online_map);
-                        cpu_attach_domain(&sched_domain_dummy, i);
-                arch_destroy_sched_domains();
                return NOTIFY_OK;
        case CPU_UP_CANCELED:
@@ -4888,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb,
        }
        /* The hotplug lock is already held by cpu_up/cpu_down */
-        arch_init_sched_domains();
+        arch_init_sched_domains(&cpu_online_map);
        return NOTIFY_OK;
 }
@@ -4897,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 void __init sched_init_smp(void)
 {
        lock_cpu_hotplug();
-        arch_init_sched_domains();
+        arch_init_sched_domains(&cpu_online_map);
        unlock_cpu_hotplug();
        /* XXX: Theoretical race here - CPU may be hotplugged now */
        hotcpu_notifier(update_sched_domains, 0);
@@ -4927,13 +5161,15 @@ void __init sched_init(void)
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
+                rq->nr_running = 0;
                rq->active = rq->arrays;
                rq->expired = rq->arrays + 1;
                rq->best_expired_prio = MAX_PRIO;
 #ifdef CONFIG_SMP
-                rq->sd = &sched_domain_dummy;
+                rq->sd = NULL;
-                rq->cpu_load = 0;
+                for (j = 1; j < 3; j++)
+                        rq->cpu_load[j] = 0;
                rq->active_balance = 0;
                rq->push_cpu = 0;
                rq->migration_thread = NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index d1258729a5f9..ca1186eef938 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -213,7 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 fastcall void recalc_sigpending_tsk(struct task_struct *t)
 {
        if (t->signal->group_stop_count > 0 ||
-            (t->flags & PF_FREEZE) ||
+            (freezing(t)) ||
            PENDING(&t->pending, &t->blocked) ||
            PENDING(&t->signal->shared_pending, &t->blocked))
                set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -2231,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
                        current->state = TASK_INTERRUPTIBLE;
                        timeout = schedule_timeout(timeout);
-                        if (current->flags & PF_FREEZE)
+                        try_to_freeze();
-                                refrigerator(PF_FREEZE);
                        spin_lock_irq(&current->sighand->siglock);
                        sig = dequeue_signal(current, &these, &info);
                        current->blocked = current->real_blocked;
diff --git a/kernel/sys.c b/kernel/sys.c
index da24bc1292db..9a24374c23bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
 #include <linux/workqueue.h>
 #include <linux/device.h>
 #include <linux/key.h>
@@ -405,6 +407,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
        case LINUX_REBOOT_CMD_HALT:
                notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
                system_state = SYSTEM_HALT;
+                device_suspend(PMSG_SUSPEND);
                device_shutdown();
                printk(KERN_EMERG "System halted.\n");
                machine_halt();
@@ -415,6 +418,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
        case LINUX_REBOOT_CMD_POWER_OFF:
                notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
                system_state = SYSTEM_POWER_OFF;
+                device_suspend(PMSG_SUSPEND);
                device_shutdown();
                printk(KERN_EMERG "Power down.\n");
                machine_power_off();
@@ -431,11 +435,30 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
                notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
                system_state = SYSTEM_RESTART;
+                device_suspend(PMSG_FREEZE);
                device_shutdown();
                printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
                machine_restart(buffer);
                break;
+#ifdef CONFIG_KEXEC
+        case LINUX_REBOOT_CMD_KEXEC:
+        {
+                struct kimage *image;
+                image = xchg(&kexec_image, 0);
+                if (!image) {
+                        unlock_kernel();
+                        return -EINVAL;
+                }
+                notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+                system_state = SYSTEM_RESTART;
+                device_shutdown();
+                printk(KERN_EMERG "Starting new kernel\n");
+                machine_shutdown();
+                machine_kexec(image);
+                break;
+        }
+#endif
 #ifdef CONFIG_SOFTWARE_SUSPEND
        case LINUX_REBOOT_CMD_SW_SUSPEND:
                {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6f15bea7d1a8..29196ce9b40f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
 cond_syscall(sys_lookup_dcookie);
 cond_syscall(sys_swapon);
 cond_syscall(sys_swapoff);
+cond_syscall(sys_kexec_load);
+cond_syscall(compat_sys_kexec_load);
 cond_syscall(sys_init_module);
 cond_syscall(sys_delete_module);
 cond_syscall(sys_socketpair);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 24a4d12d5aa9..270ee7fadbd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1000,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
                int error = parse_table(name, nlen, oldval, oldlenp, 
                                        newval, newlen, head->ctl_table,
                                        &context);
-                if (context)
+                kfree(context);
-                        kfree(context);
                if (error != -ENOTDIR)
                        return error;
                tmp = tmp->next;
diff --git a/kernel/timer.c b/kernel/timer.c
index 51ff917c9590..f2a11887a726 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1597,7 +1597,7 @@ void msleep(unsigned int msecs)
 EXPORT_SYMBOL(msleep);
 /**
- * msleep_interruptible - sleep waiting for waitqueue interruptions
+ * msleep_interruptible - sleep waiting for signals
 * @msecs: Time in milliseconds to sleep for
 */
 unsigned long msleep_interruptible(unsigned int msecs)