aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt65
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/cpu.c14
-rw-r--r--kernel/cpuset.c89
-rw-r--r--kernel/crash_dump.c52
-rw-r--r--kernel/fork.c21
-rw-r--r--kernel/kexec.c1063
-rw-r--r--kernel/ksysfs.c13
-rw-r--r--kernel/panic.c23
-rw-r--r--kernel/power/Kconfig8
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/disk.c35
-rw-r--r--kernel/power/main.c16
-rw-r--r--kernel/power/process.c26
-rw-r--r--kernel/power/smp.c89
-rw-r--r--kernel/power/swsusp.c93
-rw-r--r--kernel/printk.c3
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/sched.c1048
-rw-r--r--kernel/signal.c5
-rw-r--r--kernel/sys.c23
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/timer.c2
24 files changed, 2103 insertions, 600 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000000..0b46a5dff4c0
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,65 @@
1
2choice
3 prompt "Preemption Model"
4 default PREEMPT_NONE
5
6config PREEMPT_NONE
7 bool "No Forced Preemption (Server)"
8 help
9 This is the traditional Linux preemption model, geared towards
10 throughput. It will still provide good latencies most of the
11 time, but there are no guarantees and occasional longer delays
12 are possible.
13
14 Select this option if you are building a kernel for a server or
15 scientific/computation system, or if you want to maximize the
16 raw processing power of the kernel, irrespective of scheduling
17 latencies.
18
19config PREEMPT_VOLUNTARY
20 bool "Voluntary Kernel Preemption (Desktop)"
21 help
22 This option reduces the latency of the kernel by adding more
23 "explicit preemption points" to the kernel code. These new
24 preemption points have been selected to reduce the maximum
25 latency of rescheduling, providing faster application reactions,
26 at the cost of slighly lower throughput.
27
28 This allows reaction to interactive events by allowing a
29 low priority process to voluntarily preempt itself even if it
30 is in kernel mode executing a system call. This allows
31 applications to run more 'smoothly' even when the system is
32 under load.
33
34 Select this if you are building a kernel for a desktop system.
35
36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 help
39 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section)
41 preemptible. This allows reaction to interactive events by
42 permitting a low priority process to be preempted involuntarily
43 even if it is in kernel mode executing a system call and would
44 otherwise not be about to reach a natural preemption point.
45 This allows applications to run more 'smoothly' even when the
46 system is under load, at the cost of slighly lower throughput
47 and a slight runtime overhead to kernel code.
48
49 Select this if you are building a kernel for a desktop or
50 embedded system with latency requirements in the milliseconds
51 range.
52
53endchoice
54
55config PREEMPT_BKL
56 bool "Preempt The Big Kernel Lock"
57 depends on SMP || PREEMPT
58 default y
59 help
60 This option reduces the latency of the kernel by making the
61 big kernel lock preemptible.
62
63 Say Y here if you are building a kernel for a desktop system.
64 Say N if you are unsure.
65
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
17obj-$(CONFIG_KALLSYMS) += kallsyms.o 17obj-$(CONFIG_KALLSYMS) += kallsyms.o
18obj-$(CONFIG_PM) += power/ 18obj-$(CONFIG_PM) += power/
19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
20obj-$(CONFIG_KEXEC) += kexec.o
20obj-$(CONFIG_COMPAT) += compat.o 21obj-$(CONFIG_COMPAT) += compat.o
21obj-$(CONFIG_CPUSETS) += cpuset.o 22obj-$(CONFIG_CPUSETS) += cpuset.o
22obj-$(CONFIG_IKCONFIG) += configs.o 23obj-$(CONFIG_IKCONFIG) += configs.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
27obj-$(CONFIG_KPROBES) += kprobes.o 28obj-$(CONFIG_KPROBES) += kprobes.o
28obj-$(CONFIG_SYSFS) += ksysfs.o 29obj-$(CONFIG_SYSFS) += ksysfs.o
29obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
30obj-$(CONFIG_SECCOMP) += seccomp.o 32obj-$(CONFIG_SECCOMP) += seccomp.o
31 33
32ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 34ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 628f4ccda127..53d8263ae12e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused)
63{ 63{
64 int err; 64 int err;
65 65
66 /* Take offline: makes arch_cpu_down somewhat easier. */
67 cpu_clear(smp_processor_id(), cpu_online_map);
68
69 /* Ensure this CPU doesn't handle any more interrupts. */ 66 /* Ensure this CPU doesn't handle any more interrupts. */
70 err = __cpu_disable(); 67 err = __cpu_disable();
71 if (err < 0) 68 if (err < 0)
72 cpu_set(smp_processor_id(), cpu_online_map); 69 return err;
73 else
74 /* Force idle task to run as soon as we yield: it should
75 immediately notice cpu is offline and die quickly. */
76 sched_idle_next();
77 70
78 return err; 71 /* Force idle task to run as soon as we yield: it should
72 immediately notice cpu is offline and die quickly. */
73 sched_idle_next();
74 return 0;
79} 75}
80 76
81int cpu_down(unsigned int cpu) 77int cpu_down(unsigned int cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 79dd929f4084..984c0bf3807f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -595,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
595 return 0; 595 return 0;
596} 596}
597 597
598/*
599 * For a given cpuset cur, partition the system as follows
600 * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
601 * exclusive child cpusets
602 * b. All cpus in the current cpuset's cpus_allowed that are not part of any
603 * exclusive child cpusets
604 * Build these two partitions by calling partition_sched_domains
605 *
606 * Call with cpuset_sem held. May nest a call to the
607 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
608 */
609static void update_cpu_domains(struct cpuset *cur)
610{
611 struct cpuset *c, *par = cur->parent;
612 cpumask_t pspan, cspan;
613
614 if (par == NULL || cpus_empty(cur->cpus_allowed))
615 return;
616
617 /*
618 * Get all cpus from parent's cpus_allowed not part of exclusive
619 * children
620 */
621 pspan = par->cpus_allowed;
622 list_for_each_entry(c, &par->children, sibling) {
623 if (is_cpu_exclusive(c))
624 cpus_andnot(pspan, pspan, c->cpus_allowed);
625 }
626 if (is_removed(cur) || !is_cpu_exclusive(cur)) {
627 cpus_or(pspan, pspan, cur->cpus_allowed);
628 if (cpus_equal(pspan, cur->cpus_allowed))
629 return;
630 cspan = CPU_MASK_NONE;
631 } else {
632 if (cpus_empty(pspan))
633 return;
634 cspan = cur->cpus_allowed;
635 /*
636 * Get all cpus from current cpuset's cpus_allowed not part
637 * of exclusive children
638 */
639 list_for_each_entry(c, &cur->children, sibling) {
640 if (is_cpu_exclusive(c))
641 cpus_andnot(cspan, cspan, c->cpus_allowed);
642 }
643 }
644
645 lock_cpu_hotplug();
646 partition_sched_domains(&pspan, &cspan);
647 unlock_cpu_hotplug();
648}
649
598static int update_cpumask(struct cpuset *cs, char *buf) 650static int update_cpumask(struct cpuset *cs, char *buf)
599{ 651{
600 struct cpuset trialcs; 652 struct cpuset trialcs;
601 int retval; 653 int retval, cpus_unchanged;
602 654
603 trialcs = *cs; 655 trialcs = *cs;
604 retval = cpulist_parse(buf, trialcs.cpus_allowed); 656 retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -608,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
608 if (cpus_empty(trialcs.cpus_allowed)) 660 if (cpus_empty(trialcs.cpus_allowed))
609 return -ENOSPC; 661 return -ENOSPC;
610 retval = validate_change(cs, &trialcs); 662 retval = validate_change(cs, &trialcs);
611 if (retval == 0) 663 if (retval < 0)
612 cs->cpus_allowed = trialcs.cpus_allowed; 664 return retval;
613 return retval; 665 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
666 cs->cpus_allowed = trialcs.cpus_allowed;
667 if (is_cpu_exclusive(cs) && !cpus_unchanged)
668 update_cpu_domains(cs);
669 return 0;
614} 670}
615 671
616static int update_nodemask(struct cpuset *cs, char *buf) 672static int update_nodemask(struct cpuset *cs, char *buf)
@@ -646,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
646{ 702{
647 int turning_on; 703 int turning_on;
648 struct cpuset trialcs; 704 struct cpuset trialcs;
649 int err; 705 int err, cpu_exclusive_changed;
650 706
651 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 707 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
652 708
@@ -657,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
657 clear_bit(bit, &trialcs.flags); 713 clear_bit(bit, &trialcs.flags);
658 714
659 err = validate_change(cs, &trialcs); 715 err = validate_change(cs, &trialcs);
660 if (err == 0) { 716 if (err < 0)
661 if (turning_on) 717 return err;
662 set_bit(bit, &cs->flags); 718 cpu_exclusive_changed =
663 else 719 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
664 clear_bit(bit, &cs->flags); 720 if (turning_on)
665 } 721 set_bit(bit, &cs->flags);
666 return err; 722 else
723 clear_bit(bit, &cs->flags);
724
725 if (cpu_exclusive_changed)
726 update_cpu_domains(cs);
727 return 0;
667} 728}
668 729
669static int attach_task(struct cpuset *cs, char *buf) 730static int attach_task(struct cpuset *cs, char *buf)
@@ -1309,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1309 up(&cpuset_sem); 1370 up(&cpuset_sem);
1310 return -EBUSY; 1371 return -EBUSY;
1311 } 1372 }
1312 spin_lock(&cs->dentry->d_lock);
1313 parent = cs->parent; 1373 parent = cs->parent;
1314 set_bit(CS_REMOVED, &cs->flags); 1374 set_bit(CS_REMOVED, &cs->flags);
1375 if (is_cpu_exclusive(cs))
1376 update_cpu_domains(cs);
1315 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1377 list_del(&cs->sibling); /* delete my sibling from parent->children */
1316 if (list_empty(&parent->children)) 1378 if (list_empty(&parent->children))
1317 check_for_release(parent); 1379 check_for_release(parent);
1380 spin_lock(&cs->dentry->d_lock);
1318 d = dget(cs->dentry); 1381 d = dget(cs->dentry);
1319 cs->dentry = NULL; 1382 cs->dentry = NULL;
1320 spin_unlock(&d->d_lock); 1383 spin_unlock(&d->d_lock);
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..459ba49e376a
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,52 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/smp_lock.h>
9#include <linux/errno.h>
10#include <linux/proc_fs.h>
11#include <linux/bootmem.h>
12#include <linux/highmem.h>
13#include <linux/crash_dump.h>
14
15#include <asm/io.h>
16#include <asm/uaccess.h>
17
18/* Stores the physical address of elf header of crash image. */
19unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
20
21/*
22 * Copy a page from "oldmem". For this page, there is no pte mapped
23 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
24 */
25ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
26 size_t csize, unsigned long offset, int userbuf)
27{
28 void *page, *vaddr;
29
30 if (!csize)
31 return 0;
32
33 page = kmalloc(PAGE_SIZE, GFP_KERNEL);
34 if (!page)
35 return -ENOMEM;
36
37 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
38 copy_page(page, vaddr);
39 kunmap_atomic(vaddr, KM_PTE0);
40
41 if (userbuf) {
42 if (copy_to_user(buf, (page + offset), csize)) {
43 kfree(page);
44 return -EFAULT;
45 }
46 } else {
47 memcpy(buf, (page + offset), csize);
48 }
49
50 kfree(page);
51 return csize;
52}
diff --git a/kernel/fork.c b/kernel/fork.c
index a28d11e10877..2c7806873bfd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1003,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags,
1003 p->pdeath_signal = 0; 1003 p->pdeath_signal = 0;
1004 p->exit_state = 0; 1004 p->exit_state = 0;
1005 1005
1006 /* Perform scheduler related setup */
1007 sched_fork(p);
1008
1009 /* 1006 /*
1010 * Ok, make it visible to the rest of the system. 1007 * Ok, make it visible to the rest of the system.
1011 * We dont wake it up yet. 1008 * We dont wake it up yet.
@@ -1014,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags,
1014 INIT_LIST_HEAD(&p->ptrace_children); 1011 INIT_LIST_HEAD(&p->ptrace_children);
1015 INIT_LIST_HEAD(&p->ptrace_list); 1012 INIT_LIST_HEAD(&p->ptrace_list);
1016 1013
1014 /* Perform scheduler related setup. Assign this task to a CPU. */
1015 sched_fork(p, clone_flags);
1016
1017 /* Need tasklist lock for parent etc handling! */ 1017 /* Need tasklist lock for parent etc handling! */
1018 write_lock_irq(&tasklist_lock); 1018 write_lock_irq(&tasklist_lock);
1019 1019
1020 /* 1020 /*
1021 * The task hasn't been attached yet, so cpus_allowed mask cannot 1021 * The task hasn't been attached yet, so its cpus_allowed mask will
1022 * have changed. The cpus_allowed mask of the parent may have 1022 * not be changed, nor will its assigned CPU.
1023 * changed after it was copied first time, and it may then move to 1023 *
1024 * another CPU - so we re-copy it here and set the child's CPU to 1024 * The cpus_allowed mask of the parent may have changed after it was
1025 * the parent's CPU. This avoids alot of nasty races. 1025 * copied first time - so re-copy it here, then check the child's CPU
1026 * to ensure it is on a valid CPU (and if not, just force it back to
1027 * parent's CPU). This avoids alot of nasty races.
1026 */ 1028 */
1027 p->cpus_allowed = current->cpus_allowed; 1029 p->cpus_allowed = current->cpus_allowed;
1028 set_task_cpu(p, smp_processor_id()); 1030 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
1031 set_task_cpu(p, smp_processor_id());
1029 1032
1030 /* 1033 /*
1031 * Check for pending SIGKILL! The new thread should not be allowed 1034 * Check for pending SIGKILL! The new thread should not be allowed
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..7843548cf2d9
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1063 @@
1/*
2 * kexec.c - kexec system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/file.h>
11#include <linux/slab.h>
12#include <linux/fs.h>
13#include <linux/kexec.h>
14#include <linux/spinlock.h>
15#include <linux/list.h>
16#include <linux/highmem.h>
17#include <linux/syscalls.h>
18#include <linux/reboot.h>
19#include <linux/syscalls.h>
20#include <linux/ioport.h>
21#include <linux/hardirq.h>
22
23#include <asm/page.h>
24#include <asm/uaccess.h>
25#include <asm/io.h>
26#include <asm/system.h>
27#include <asm/semaphore.h>
28
29/* Location of the reserved area for the crash kernel */
30struct resource crashk_res = {
31 .name = "Crash kernel",
32 .start = 0,
33 .end = 0,
34 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
35};
36
37int kexec_should_crash(struct task_struct *p)
38{
39 if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
40 return 1;
41 return 0;
42}
43
44/*
45 * When kexec transitions to the new kernel there is a one-to-one
46 * mapping between physical and virtual addresses. On processors
47 * where you can disable the MMU this is trivial, and easy. For
48 * others it is still a simple predictable page table to setup.
49 *
50 * In that environment kexec copies the new kernel to its final
51 * resting place. This means I can only support memory whose
52 * physical address can fit in an unsigned long. In particular
53 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
54 * If the assembly stub has more restrictive requirements
55 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
56 * defined more restrictively in <asm/kexec.h>.
57 *
58 * The code for the transition from the current kernel to the
59 * the new kernel is placed in the control_code_buffer, whose size
60 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
61 * page of memory is necessary, but some architectures require more.
62 * Because this memory must be identity mapped in the transition from
63 * virtual to physical addresses it must live in the range
64 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
65 * modifiable.
66 *
67 * The assembly stub in the control code buffer is passed a linked list
68 * of descriptor pages detailing the source pages of the new kernel,
69 * and the destination addresses of those source pages. As this data
70 * structure is not used in the context of the current OS, it must
71 * be self-contained.
72 *
73 * The code has been made to work with highmem pages and will use a
74 * destination page in its final resting place (if it happens
75 * to allocate it). The end product of this is that most of the
76 * physical address space, and most of RAM can be used.
77 *
78 * Future directions include:
79 * - allocating a page table with the control code buffer identity
80 * mapped, to simplify machine_kexec and make kexec_on_panic more
81 * reliable.
82 */
83
84/*
85 * KIMAGE_NO_DEST is an impossible destination address..., for
86 * allocating pages whose destination address we do not care about.
87 */
88#define KIMAGE_NO_DEST (-1UL)
89
90static int kimage_is_destination_range(struct kimage *image,
91 unsigned long start, unsigned long end);
92static struct page *kimage_alloc_page(struct kimage *image,
93 unsigned int gfp_mask,
94 unsigned long dest);
95
96static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
97 unsigned long nr_segments,
98 struct kexec_segment __user *segments)
99{
100 size_t segment_bytes;
101 struct kimage *image;
102 unsigned long i;
103 int result;
104
105 /* Allocate a controlling structure */
106 result = -ENOMEM;
107 image = kmalloc(sizeof(*image), GFP_KERNEL);
108 if (!image)
109 goto out;
110
111 memset(image, 0, sizeof(*image));
112 image->head = 0;
113 image->entry = &image->head;
114 image->last_entry = &image->head;
115 image->control_page = ~0; /* By default this does not apply */
116 image->start = entry;
117 image->type = KEXEC_TYPE_DEFAULT;
118
119 /* Initialize the list of control pages */
120 INIT_LIST_HEAD(&image->control_pages);
121
122 /* Initialize the list of destination pages */
123 INIT_LIST_HEAD(&image->dest_pages);
124
125 /* Initialize the list of unuseable pages */
126 INIT_LIST_HEAD(&image->unuseable_pages);
127
128 /* Read in the segments */
129 image->nr_segments = nr_segments;
130 segment_bytes = nr_segments * sizeof(*segments);
131 result = copy_from_user(image->segment, segments, segment_bytes);
132 if (result)
133 goto out;
134
135 /*
136 * Verify we have good destination addresses. The caller is
137 * responsible for making certain we don't attempt to load
138 * the new image into invalid or reserved areas of RAM. This
139 * just verifies it is an address we can use.
140 *
141 * Since the kernel does everything in page size chunks ensure
142 * the destination addreses are page aligned. Too many
143 * special cases crop of when we don't do this. The most
144 * insidious is getting overlapping destination addresses
145 * simply because addresses are changed to page size
146 * granularity.
147 */
148 result = -EADDRNOTAVAIL;
149 for (i = 0; i < nr_segments; i++) {
150 unsigned long mstart, mend;
151
152 mstart = image->segment[i].mem;
153 mend = mstart + image->segment[i].memsz;
154 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
155 goto out;
156 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
157 goto out;
158 }
159
160 /* Verify our destination addresses do not overlap.
161 * If we alloed overlapping destination addresses
162 * through very weird things can happen with no
163 * easy explanation as one segment stops on another.
164 */
165 result = -EINVAL;
166 for (i = 0; i < nr_segments; i++) {
167 unsigned long mstart, mend;
168 unsigned long j;
169
170 mstart = image->segment[i].mem;
171 mend = mstart + image->segment[i].memsz;
172 for (j = 0; j < i; j++) {
173 unsigned long pstart, pend;
174 pstart = image->segment[j].mem;
175 pend = pstart + image->segment[j].memsz;
176 /* Do the segments overlap ? */
177 if ((mend > pstart) && (mstart < pend))
178 goto out;
179 }
180 }
181
182 /* Ensure our buffer sizes are strictly less than
183 * our memory sizes. This should always be the case,
184 * and it is easier to check up front than to be surprised
185 * later on.
186 */
187 result = -EINVAL;
188 for (i = 0; i < nr_segments; i++) {
189 if (image->segment[i].bufsz > image->segment[i].memsz)
190 goto out;
191 }
192
193 result = 0;
194out:
195 if (result == 0)
196 *rimage = image;
197 else
198 kfree(image);
199
200 return result;
201
202}
203
204static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
205 unsigned long nr_segments,
206 struct kexec_segment __user *segments)
207{
208 int result;
209 struct kimage *image;
210
211 /* Allocate and initialize a controlling structure */
212 image = NULL;
213 result = do_kimage_alloc(&image, entry, nr_segments, segments);
214 if (result)
215 goto out;
216
217 *rimage = image;
218
219 /*
220 * Find a location for the control code buffer, and add it
221 * the vector of segments so that it's pages will also be
222 * counted as destination pages.
223 */
224 result = -ENOMEM;
225 image->control_code_page = kimage_alloc_control_pages(image,
226 get_order(KEXEC_CONTROL_CODE_SIZE));
227 if (!image->control_code_page) {
228 printk(KERN_ERR "Could not allocate control_code_buffer\n");
229 goto out;
230 }
231
232 result = 0;
233 out:
234 if (result == 0)
235 *rimage = image;
236 else
237 kfree(image);
238
239 return result;
240}
241
242static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
243 unsigned long nr_segments,
244 struct kexec_segment *segments)
245{
246 int result;
247 struct kimage *image;
248 unsigned long i;
249
250 image = NULL;
251 /* Verify we have a valid entry point */
252 if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
253 result = -EADDRNOTAVAIL;
254 goto out;
255 }
256
257 /* Allocate and initialize a controlling structure */
258 result = do_kimage_alloc(&image, entry, nr_segments, segments);
259 if (result)
260 goto out;
261
262 /* Enable the special crash kernel control page
263 * allocation policy.
264 */
265 image->control_page = crashk_res.start;
266 image->type = KEXEC_TYPE_CRASH;
267
268 /*
269 * Verify we have good destination addresses. Normally
270 * the caller is responsible for making certain we don't
271 * attempt to load the new image into invalid or reserved
272 * areas of RAM. But crash kernels are preloaded into a
273 * reserved area of ram. We must ensure the addresses
274 * are in the reserved area otherwise preloading the
275 * kernel could corrupt things.
276 */
277 result = -EADDRNOTAVAIL;
278 for (i = 0; i < nr_segments; i++) {
279 unsigned long mstart, mend;
280
281 mstart = image->segment[i].mem;
282 mend = mstart + image->segment[i].memsz - 1;
283 /* Ensure we are within the crash kernel limits */
284 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
285 goto out;
286 }
287
288 /*
289 * Find a location for the control code buffer, and add
290 * the vector of segments so that it's pages will also be
291 * counted as destination pages.
292 */
293 result = -ENOMEM;
294 image->control_code_page = kimage_alloc_control_pages(image,
295 get_order(KEXEC_CONTROL_CODE_SIZE));
296 if (!image->control_code_page) {
297 printk(KERN_ERR "Could not allocate control_code_buffer\n");
298 goto out;
299 }
300
301 result = 0;
302out:
303 if (result == 0)
304 *rimage = image;
305 else
306 kfree(image);
307
308 return result;
309}
310
311static int kimage_is_destination_range(struct kimage *image,
312 unsigned long start,
313 unsigned long end)
314{
315 unsigned long i;
316
317 for (i = 0; i < image->nr_segments; i++) {
318 unsigned long mstart, mend;
319
320 mstart = image->segment[i].mem;
321 mend = mstart + image->segment[i].memsz;
322 if ((end > mstart) && (start < mend))
323 return 1;
324 }
325
326 return 0;
327}
328
329static struct page *kimage_alloc_pages(unsigned int gfp_mask,
330 unsigned int order)
331{
332 struct page *pages;
333
334 pages = alloc_pages(gfp_mask, order);
335 if (pages) {
336 unsigned int count, i;
337 pages->mapping = NULL;
338 pages->private = order;
339 count = 1 << order;
340 for (i = 0; i < count; i++)
341 SetPageReserved(pages + i);
342 }
343
344 return pages;
345}
346
347static void kimage_free_pages(struct page *page)
348{
349 unsigned int order, count, i;
350
351 order = page->private;
352 count = 1 << order;
353 for (i = 0; i < count; i++)
354 ClearPageReserved(page + i);
355 __free_pages(page, order);
356}
357
358static void kimage_free_page_list(struct list_head *list)
359{
360 struct list_head *pos, *next;
361
362 list_for_each_safe(pos, next, list) {
363 struct page *page;
364
365 page = list_entry(pos, struct page, lru);
366 list_del(&page->lru);
367 kimage_free_pages(page);
368 }
369}
370
371static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
372 unsigned int order)
373{
374 /* Control pages are special, they are the intermediaries
375 * that are needed while we copy the rest of the pages
376 * to their final resting place. As such they must
377 * not conflict with either the destination addresses
378 * or memory the kernel is already using.
379 *
380 * The only case where we really need more than one of
381 * these are for architectures where we cannot disable
382 * the MMU and must instead generate an identity mapped
383 * page table for all of the memory.
384 *
385 * At worst this runs in O(N) of the image size.
386 */
387 struct list_head extra_pages;
388 struct page *pages;
389 unsigned int count;
390
391 count = 1 << order;
392 INIT_LIST_HEAD(&extra_pages);
393
394 /* Loop while I can allocate a page and the page allocated
395 * is a destination page.
396 */
397 do {
398 unsigned long pfn, epfn, addr, eaddr;
399
400 pages = kimage_alloc_pages(GFP_KERNEL, order);
401 if (!pages)
402 break;
403 pfn = page_to_pfn(pages);
404 epfn = pfn + count;
405 addr = pfn << PAGE_SHIFT;
406 eaddr = epfn << PAGE_SHIFT;
407 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
408 kimage_is_destination_range(image, addr, eaddr)) {
409 list_add(&pages->lru, &extra_pages);
410 pages = NULL;
411 }
412 } while (!pages);
413
414 if (pages) {
415 /* Remember the allocated page... */
416 list_add(&pages->lru, &image->control_pages);
417
418 /* Because the page is already in it's destination
419 * location we will never allocate another page at
420 * that address. Therefore kimage_alloc_pages
421 * will not return it (again) and we don't need
422 * to give it an entry in image->segment[].
423 */
424 }
425 /* Deal with the destination pages I have inadvertently allocated.
426 *
427 * Ideally I would convert multi-page allocations into single
428 * page allocations, and add everyting to image->dest_pages.
429 *
430 * For now it is simpler to just free the pages.
431 */
432 kimage_free_page_list(&extra_pages);
433
434 return pages;
435}
436
437static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
438 unsigned int order)
439{
440 /* Control pages are special, they are the intermediaries
441 * that are needed while we copy the rest of the pages
442 * to their final resting place. As such they must
443 * not conflict with either the destination addresses
444 * or memory the kernel is already using.
445 *
446 * Control pages are also the only pags we must allocate
447 * when loading a crash kernel. All of the other pages
448 * are specified by the segments and we just memcpy
449 * into them directly.
450 *
451 * The only case where we really need more than one of
452 * these are for architectures where we cannot disable
453 * the MMU and must instead generate an identity mapped
454 * page table for all of the memory.
455 *
456 * Given the low demand this implements a very simple
457 * allocator that finds the first hole of the appropriate
458 * size in the reserved memory region, and allocates all
459 * of the memory up to and including the hole.
460 */
461 unsigned long hole_start, hole_end, size;
462 struct page *pages;
463
464 pages = NULL;
465 size = (1 << order) << PAGE_SHIFT;
466 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
467 hole_end = hole_start + size - 1;
468 while (hole_end <= crashk_res.end) {
469 unsigned long i;
470
471 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
472 break;
473 if (hole_end > crashk_res.end)
474 break;
475 /* See if I overlap any of the segments */
476 for (i = 0; i < image->nr_segments; i++) {
477 unsigned long mstart, mend;
478
479 mstart = image->segment[i].mem;
480 mend = mstart + image->segment[i].memsz - 1;
481 if ((hole_end >= mstart) && (hole_start <= mend)) {
482 /* Advance the hole to the end of the segment */
483 hole_start = (mend + (size - 1)) & ~(size - 1);
484 hole_end = hole_start + size - 1;
485 break;
486 }
487 }
488 /* If I don't overlap any segments I have found my hole! */
489 if (i == image->nr_segments) {
490 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
491 break;
492 }
493 }
494 if (pages)
495 image->control_page = hole_end;
496
497 return pages;
498}
499
500
501struct page *kimage_alloc_control_pages(struct kimage *image,
502 unsigned int order)
503{
504 struct page *pages = NULL;
505
506 switch (image->type) {
507 case KEXEC_TYPE_DEFAULT:
508 pages = kimage_alloc_normal_control_pages(image, order);
509 break;
510 case KEXEC_TYPE_CRASH:
511 pages = kimage_alloc_crash_control_pages(image, order);
512 break;
513 }
514
515 return pages;
516}
517
518static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
519{
520 if (*image->entry != 0)
521 image->entry++;
522
523 if (image->entry == image->last_entry) {
524 kimage_entry_t *ind_page;
525 struct page *page;
526
527 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
528 if (!page)
529 return -ENOMEM;
530
531 ind_page = page_address(page);
532 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
533 image->entry = ind_page;
534 image->last_entry = ind_page +
535 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
536 }
537 *image->entry = entry;
538 image->entry++;
539 *image->entry = 0;
540
541 return 0;
542}
543
544static int kimage_set_destination(struct kimage *image,
545 unsigned long destination)
546{
547 int result;
548
549 destination &= PAGE_MASK;
550 result = kimage_add_entry(image, destination | IND_DESTINATION);
551 if (result == 0)
552 image->destination = destination;
553
554 return result;
555}
556
557
558static int kimage_add_page(struct kimage *image, unsigned long page)
559{
560 int result;
561
562 page &= PAGE_MASK;
563 result = kimage_add_entry(image, page | IND_SOURCE);
564 if (result == 0)
565 image->destination += PAGE_SIZE;
566
567 return result;
568}
569
570
571static void kimage_free_extra_pages(struct kimage *image)
572{
573 /* Walk through and free any extra destination pages I may have */
574 kimage_free_page_list(&image->dest_pages);
575
576 /* Walk through and free any unuseable pages I have cached */
577 kimage_free_page_list(&image->unuseable_pages);
578
579}
580static int kimage_terminate(struct kimage *image)
581{
582 if (*image->entry != 0)
583 image->entry++;
584
585 *image->entry = IND_DONE;
586
587 return 0;
588}
589
590#define for_each_kimage_entry(image, ptr, entry) \
591 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
592 ptr = (entry & IND_INDIRECTION)? \
593 phys_to_virt((entry & PAGE_MASK)): ptr +1)
594
595static void kimage_free_entry(kimage_entry_t entry)
596{
597 struct page *page;
598
599 page = pfn_to_page(entry >> PAGE_SHIFT);
600 kimage_free_pages(page);
601}
602
603static void kimage_free(struct kimage *image)
604{
605 kimage_entry_t *ptr, entry;
606 kimage_entry_t ind = 0;
607
608 if (!image)
609 return;
610
611 kimage_free_extra_pages(image);
612 for_each_kimage_entry(image, ptr, entry) {
613 if (entry & IND_INDIRECTION) {
614 /* Free the previous indirection page */
615 if (ind & IND_INDIRECTION)
616 kimage_free_entry(ind);
617 /* Save this indirection page until we are
618 * done with it.
619 */
620 ind = entry;
621 }
622 else if (entry & IND_SOURCE)
623 kimage_free_entry(entry);
624 }
625 /* Free the final indirection page */
626 if (ind & IND_INDIRECTION)
627 kimage_free_entry(ind);
628
629 /* Handle any machine specific cleanup */
630 machine_kexec_cleanup(image);
631
632 /* Free the kexec control pages... */
633 kimage_free_page_list(&image->control_pages);
634 kfree(image);
635}
636
637static kimage_entry_t *kimage_dst_used(struct kimage *image,
638 unsigned long page)
639{
640 kimage_entry_t *ptr, entry;
641 unsigned long destination = 0;
642
643 for_each_kimage_entry(image, ptr, entry) {
644 if (entry & IND_DESTINATION)
645 destination = entry & PAGE_MASK;
646 else if (entry & IND_SOURCE) {
647 if (page == destination)
648 return ptr;
649 destination += PAGE_SIZE;
650 }
651 }
652
653 return 0;
654}
655
656static struct page *kimage_alloc_page(struct kimage *image,
657 unsigned int gfp_mask,
658 unsigned long destination)
659{
660 /*
661 * Here we implement safeguards to ensure that a source page
662 * is not copied to its destination page before the data on
663 * the destination page is no longer useful.
664 *
665 * To do this we maintain the invariant that a source page is
666 * either its own destination page, or it is not a
667 * destination page at all.
668 *
669 * That is slightly stronger than required, but the proof
670 * that no problems will not occur is trivial, and the
671 * implementation is simply to verify.
672 *
673 * When allocating all pages normally this algorithm will run
674 * in O(N) time, but in the worst case it will run in O(N^2)
675 * time. If the runtime is a problem the data structures can
676 * be fixed.
677 */
678 struct page *page;
679 unsigned long addr;
680
681 /*
682 * Walk through the list of destination pages, and see if I
683 * have a match.
684 */
685 list_for_each_entry(page, &image->dest_pages, lru) {
686 addr = page_to_pfn(page) << PAGE_SHIFT;
687 if (addr == destination) {
688 list_del(&page->lru);
689 return page;
690 }
691 }
692 page = NULL;
693 while (1) {
694 kimage_entry_t *old;
695
696 /* Allocate a page, if we run out of memory give up */
697 page = kimage_alloc_pages(gfp_mask, 0);
698 if (!page)
699 return 0;
700 /* If the page cannot be used file it away */
701 if (page_to_pfn(page) >
702 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
703 list_add(&page->lru, &image->unuseable_pages);
704 continue;
705 }
706 addr = page_to_pfn(page) << PAGE_SHIFT;
707
708 /* If it is the destination page we want use it */
709 if (addr == destination)
710 break;
711
712 /* If the page is not a destination page use it */
713 if (!kimage_is_destination_range(image, addr,
714 addr + PAGE_SIZE))
715 break;
716
717 /*
718 * I know that the page is someones destination page.
719 * See if there is already a source page for this
720 * destination page. And if so swap the source pages.
721 */
722 old = kimage_dst_used(image, addr);
723 if (old) {
724 /* If so move it */
725 unsigned long old_addr;
726 struct page *old_page;
727
728 old_addr = *old & PAGE_MASK;
729 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
730 copy_highpage(page, old_page);
731 *old = addr | (*old & ~PAGE_MASK);
732
733 /* The old page I have found cannot be a
734 * destination page, so return it.
735 */
736 addr = old_addr;
737 page = old_page;
738 break;
739 }
740 else {
741 /* Place the page on the destination list I
742 * will use it later.
743 */
744 list_add(&page->lru, &image->dest_pages);
745 }
746 }
747
748 return page;
749}
750
751static int kimage_load_normal_segment(struct kimage *image,
752 struct kexec_segment *segment)
753{
754 unsigned long maddr;
755 unsigned long ubytes, mbytes;
756 int result;
757 unsigned char *buf;
758
759 result = 0;
760 buf = segment->buf;
761 ubytes = segment->bufsz;
762 mbytes = segment->memsz;
763 maddr = segment->mem;
764
765 result = kimage_set_destination(image, maddr);
766 if (result < 0)
767 goto out;
768
769 while (mbytes) {
770 struct page *page;
771 char *ptr;
772 size_t uchunk, mchunk;
773
774 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
775 if (page == 0) {
776 result = -ENOMEM;
777 goto out;
778 }
779 result = kimage_add_page(image, page_to_pfn(page)
780 << PAGE_SHIFT);
781 if (result < 0)
782 goto out;
783
784 ptr = kmap(page);
785 /* Start with a clear page */
786 memset(ptr, 0, PAGE_SIZE);
787 ptr += maddr & ~PAGE_MASK;
788 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
789 if (mchunk > mbytes)
790 mchunk = mbytes;
791
792 uchunk = mchunk;
793 if (uchunk > ubytes)
794 uchunk = ubytes;
795
796 result = copy_from_user(ptr, buf, uchunk);
797 kunmap(page);
798 if (result) {
799 result = (result < 0) ? result : -EIO;
800 goto out;
801 }
802 ubytes -= uchunk;
803 maddr += mchunk;
804 buf += mchunk;
805 mbytes -= mchunk;
806 }
807out:
808 return result;
809}
810
811static int kimage_load_crash_segment(struct kimage *image,
812 struct kexec_segment *segment)
813{
814 /* For crash dumps kernels we simply copy the data from
815 * user space to it's destination.
816 * We do things a page at a time for the sake of kmap.
817 */
818 unsigned long maddr;
819 unsigned long ubytes, mbytes;
820 int result;
821 unsigned char *buf;
822
823 result = 0;
824 buf = segment->buf;
825 ubytes = segment->bufsz;
826 mbytes = segment->memsz;
827 maddr = segment->mem;
828 while (mbytes) {
829 struct page *page;
830 char *ptr;
831 size_t uchunk, mchunk;
832
833 page = pfn_to_page(maddr >> PAGE_SHIFT);
834 if (page == 0) {
835 result = -ENOMEM;
836 goto out;
837 }
838 ptr = kmap(page);
839 ptr += maddr & ~PAGE_MASK;
840 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
841 if (mchunk > mbytes)
842 mchunk = mbytes;
843
844 uchunk = mchunk;
845 if (uchunk > ubytes) {
846 uchunk = ubytes;
847 /* Zero the trailing part of the page */
848 memset(ptr + uchunk, 0, mchunk - uchunk);
849 }
850 result = copy_from_user(ptr, buf, uchunk);
851 kunmap(page);
852 if (result) {
853 result = (result < 0) ? result : -EIO;
854 goto out;
855 }
856 ubytes -= uchunk;
857 maddr += mchunk;
858 buf += mchunk;
859 mbytes -= mchunk;
860 }
861out:
862 return result;
863}
864
865static int kimage_load_segment(struct kimage *image,
866 struct kexec_segment *segment)
867{
868 int result = -ENOMEM;
869
870 switch (image->type) {
871 case KEXEC_TYPE_DEFAULT:
872 result = kimage_load_normal_segment(image, segment);
873 break;
874 case KEXEC_TYPE_CRASH:
875 result = kimage_load_crash_segment(image, segment);
876 break;
877 }
878
879 return result;
880}
881
882/*
883 * Exec Kernel system call: for obvious reasons only root may call it.
884 *
885 * This call breaks up into three pieces.
886 * - A generic part which loads the new kernel from the current
887 * address space, and very carefully places the data in the
888 * allocated pages.
889 *
890 * - A generic part that interacts with the kernel and tells all of
891 * the devices to shut down. Preventing on-going dmas, and placing
892 * the devices in a consistent state so a later kernel can
893 * reinitialize them.
894 *
895 * - A machine specific part that includes the syscall number
896 * and the copies the image to it's final destination. And
897 * jumps into the image at entry.
898 *
899 * kexec does not sync, or unmount filesystems so if you need
900 * that to happen you need to do that yourself.
901 */
902struct kimage *kexec_image = NULL;
903static struct kimage *kexec_crash_image = NULL;
904/*
905 * A home grown binary mutex.
906 * Nothing can wait so this mutex is safe to use
907 * in interrupt context :)
908 */
909static int kexec_lock = 0;
910
911asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
912 struct kexec_segment __user *segments,
913 unsigned long flags)
914{
915 struct kimage **dest_image, *image;
916 int locked;
917 int result;
918
919 /* We only trust the superuser with rebooting the system. */
920 if (!capable(CAP_SYS_BOOT))
921 return -EPERM;
922
923 /*
924 * Verify we have a legal set of flags
925 * This leaves us room for future extensions.
926 */
927 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
928 return -EINVAL;
929
930 /* Verify we are on the appropriate architecture */
931 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
932 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
933 return -EINVAL;
934
935 /* Put an artificial cap on the number
936 * of segments passed to kexec_load.
937 */
938 if (nr_segments > KEXEC_SEGMENT_MAX)
939 return -EINVAL;
940
941 image = NULL;
942 result = 0;
943
944 /* Because we write directly to the reserved memory
945 * region when loading crash kernels we need a mutex here to
946 * prevent multiple crash kernels from attempting to load
947 * simultaneously, and to prevent a crash kernel from loading
948 * over the top of a in use crash kernel.
949 *
950 * KISS: always take the mutex.
951 */
952 locked = xchg(&kexec_lock, 1);
953 if (locked)
954 return -EBUSY;
955
956 dest_image = &kexec_image;
957 if (flags & KEXEC_ON_CRASH)
958 dest_image = &kexec_crash_image;
959 if (nr_segments > 0) {
960 unsigned long i;
961
962 /* Loading another kernel to reboot into */
963 if ((flags & KEXEC_ON_CRASH) == 0)
964 result = kimage_normal_alloc(&image, entry,
965 nr_segments, segments);
966 /* Loading another kernel to switch to if this one crashes */
967 else if (flags & KEXEC_ON_CRASH) {
968 /* Free any current crash dump kernel before
969 * we corrupt it.
970 */
971 kimage_free(xchg(&kexec_crash_image, NULL));
972 result = kimage_crash_alloc(&image, entry,
973 nr_segments, segments);
974 }
975 if (result)
976 goto out;
977
978 result = machine_kexec_prepare(image);
979 if (result)
980 goto out;
981
982 for (i = 0; i < nr_segments; i++) {
983 result = kimage_load_segment(image, &image->segment[i]);
984 if (result)
985 goto out;
986 }
987 result = kimage_terminate(image);
988 if (result)
989 goto out;
990 }
991 /* Install the new kernel, and Uninstall the old */
992 image = xchg(dest_image, image);
993
994out:
995 xchg(&kexec_lock, 0); /* Release the mutex */
996 kimage_free(image);
997
998 return result;
999}
1000
1001#ifdef CONFIG_COMPAT
1002asmlinkage long compat_sys_kexec_load(unsigned long entry,
1003 unsigned long nr_segments,
1004 struct compat_kexec_segment __user *segments,
1005 unsigned long flags)
1006{
1007 struct compat_kexec_segment in;
1008 struct kexec_segment out, __user *ksegments;
1009 unsigned long i, result;
1010
1011 /* Don't allow clients that don't understand the native
1012 * architecture to do anything.
1013 */
1014 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
1015 return -EINVAL;
1016
1017 if (nr_segments > KEXEC_SEGMENT_MAX)
1018 return -EINVAL;
1019
1020 ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1021 for (i=0; i < nr_segments; i++) {
1022 result = copy_from_user(&in, &segments[i], sizeof(in));
1023 if (result)
1024 return -EFAULT;
1025
1026 out.buf = compat_ptr(in.buf);
1027 out.bufsz = in.bufsz;
1028 out.mem = in.mem;
1029 out.memsz = in.memsz;
1030
1031 result = copy_to_user(&ksegments[i], &out, sizeof(out));
1032 if (result)
1033 return -EFAULT;
1034 }
1035
1036 return sys_kexec_load(entry, nr_segments, ksegments, flags);
1037}
1038#endif
1039
1040void crash_kexec(struct pt_regs *regs)
1041{
1042 struct kimage *image;
1043 int locked;
1044
1045
1046 /* Take the kexec_lock here to prevent sys_kexec_load
1047 * running on one cpu from replacing the crash kernel
1048 * we are using after a panic on a different cpu.
1049 *
1050 * If the crash kernel was not located in a fixed area
1051 * of memory the xchg(&kexec_crash_image) would be
1052 * sufficient. But since I reuse the memory...
1053 */
1054 locked = xchg(&kexec_lock, 1);
1055 if (!locked) {
1056 image = xchg(&kexec_crash_image, NULL);
1057 if (image) {
1058 machine_crash_shutdown(regs);
1059 machine_kexec(image);
1060 }
1061 xchg(&kexec_lock, 0);
1062 }
1063}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1f064a63f8cf..015fb69ad94d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
30KERNEL_ATTR_RO(hotplug_seqnum); 30KERNEL_ATTR_RO(hotplug_seqnum);
31#endif 31#endif
32 32
33#ifdef CONFIG_KEXEC
34#include <asm/kexec.h>
35
36static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
37{
38 return sprintf(page, "%p\n", (void *)crash_notes);
39}
40KERNEL_ATTR_RO(crash_notes);
41#endif
42
33decl_subsys(kernel, NULL, NULL); 43decl_subsys(kernel, NULL, NULL);
34EXPORT_SYMBOL_GPL(kernel_subsys); 44EXPORT_SYMBOL_GPL(kernel_subsys);
35 45
@@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = {
37#ifdef CONFIG_HOTPLUG 47#ifdef CONFIG_HOTPLUG
38 &hotplug_seqnum_attr.attr, 48 &hotplug_seqnum_attr.attr,
39#endif 49#endif
50#ifdef CONFIG_KEXEC
51 &crash_notes_attr.attr,
52#endif
40 NULL 53 NULL
41}; 54};
42 55
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..74ba5f3e46c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
18#include <linux/sysrq.h> 18#include <linux/sysrq.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h>
21 22
22int panic_timeout; 23int panic_timeout;
23int panic_on_oops; 24int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
63 unsigned long caller = (unsigned long) __builtin_return_address(0); 64 unsigned long caller = (unsigned long) __builtin_return_address(0);
64#endif 65#endif
65 66
67 /*
68 * It's possible to come here directly from a panic-assertion and not
69 * have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though...
71 */
72 preempt_disable();
73
66 bust_spinlocks(1); 74 bust_spinlocks(1);
67 va_start(args, fmt); 75 va_start(args, fmt);
68 vsnprintf(buf, sizeof(buf), fmt, args); 76 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
70 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 78 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
71 bust_spinlocks(0); 79 bust_spinlocks(0);
72 80
81 /*
82 * If we have crashed and we have a crash kernel loaded let it handle
83 * everything else.
84 * Do we want to call this before we try to display a message?
85 */
86 crash_kexec(NULL);
87
73#ifdef CONFIG_SMP 88#ifdef CONFIG_SMP
89 /*
90 * Note smp_send_stop is the usual smp shutdown function, which
91 * unfortunately means it may not be hardened to work in a panic
92 * situation.
93 */
74 smp_send_stop(); 94 smp_send_stop();
75#endif 95#endif
76 96
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
79 if (!panic_blink) 99 if (!panic_blink)
80 panic_blink = no_blink; 100 panic_blink = no_blink;
81 101
82 if (panic_timeout > 0) 102 if (panic_timeout > 0) {
83 {
84 /* 103 /*
85 * Delay timeout seconds before rebooting the machine. 104 * Delay timeout seconds before rebooting the machine.
86 * We can't use the "normal" timers since we just panicked.. 105 * We can't use the "normal" timers since we just panicked..
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 696387ffe49c..2c7121d9bff1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,8 +27,8 @@ config PM_DEBUG
27 like suspend support. 27 like suspend support.
28 28
29config SOFTWARE_SUSPEND 29config SOFTWARE_SUSPEND
30 bool "Software Suspend (EXPERIMENTAL)" 30 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP 31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
32 ---help--- 32 ---help---
33 Enable the possibility of suspending the machine. 33 Enable the possibility of suspending the machine.
34 It doesn't need APM. 34 It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
72 suspended image to. It will simply pick the first available swap 72 suspended image to. It will simply pick the first available swap
73 device. 73 device.
74 74
75config SUSPEND_SMP
76 bool
77 depends on HOTPLUG_CPU && X86 && PM
78 default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index fbdc634135a7..2f438d0eaa13 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6swsusp-smp-$(CONFIG_SMP) += smp.o
7
8obj-y := main.o process.o console.o pm.o 6obj-y := main.o process.o console.o pm.o
9obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o 7obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o
8
9obj-$(CONFIG_SUSPEND_SMP) += smp.o
10 10
11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02b6764034dc..fb8de63c2919 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,8 +117,8 @@ static void finish(void)
117{ 117{
118 device_resume(); 118 device_resume();
119 platform_finish(); 119 platform_finish();
120 enable_nonboot_cpus();
121 thaw_processes(); 120 thaw_processes();
121 enable_nonboot_cpus();
122 pm_restore_console(); 122 pm_restore_console();
123} 123}
124 124
@@ -131,28 +131,35 @@ static int prepare_processes(void)
131 131
132 sys_sync(); 132 sys_sync();
133 133
134 disable_nonboot_cpus();
135
134 if (freeze_processes()) { 136 if (freeze_processes()) {
135 error = -EBUSY; 137 error = -EBUSY;
136 return error; 138 goto thaw;
137 } 139 }
138 140
139 if (pm_disk_mode == PM_DISK_PLATFORM) { 141 if (pm_disk_mode == PM_DISK_PLATFORM) {
140 if (pm_ops && pm_ops->prepare) { 142 if (pm_ops && pm_ops->prepare) {
141 if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) 143 if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
142 return error; 144 goto thaw;
143 } 145 }
144 } 146 }
145 147
146 /* Free memory before shutting down devices. */ 148 /* Free memory before shutting down devices. */
147 free_some_memory(); 149 free_some_memory();
148
149 return 0; 150 return 0;
151thaw:
152 thaw_processes();
153 enable_nonboot_cpus();
154 pm_restore_console();
155 return error;
150} 156}
151 157
152static void unprepare_processes(void) 158static void unprepare_processes(void)
153{ 159{
154 enable_nonboot_cpus(); 160 platform_finish();
155 thaw_processes(); 161 thaw_processes();
162 enable_nonboot_cpus();
156 pm_restore_console(); 163 pm_restore_console();
157} 164}
158 165
@@ -160,15 +167,9 @@ static int prepare_devices(void)
160{ 167{
161 int error; 168 int error;
162 169
163 disable_nonboot_cpus(); 170 if ((error = device_suspend(PMSG_FREEZE)))
164 if ((error = device_suspend(PMSG_FREEZE))) {
165 printk("Some devices failed to suspend\n"); 171 printk("Some devices failed to suspend\n");
166 platform_finish(); 172 return error;
167 enable_nonboot_cpus();
168 return error;
169 }
170
171 return 0;
172} 173}
173 174
174/** 175/**
@@ -185,9 +186,9 @@ int pm_suspend_disk(void)
185 int error; 186 int error;
186 187
187 error = prepare_processes(); 188 error = prepare_processes();
188 if (!error) { 189 if (error)
189 error = prepare_devices(); 190 return error;
190 } 191 error = prepare_devices();
191 192
192 if (error) { 193 if (error) {
193 unprepare_processes(); 194 unprepare_processes();
@@ -250,7 +251,7 @@ static int software_resume(void)
250 251
251 if ((error = prepare_processes())) { 252 if ((error = prepare_processes())) {
252 swsusp_close(); 253 swsusp_close();
253 goto Cleanup; 254 goto Done;
254 } 255 }
255 256
256 pr_debug("PM: Reading swsusp image.\n"); 257 pr_debug("PM: Reading swsusp image.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4cdebc972ff2..c94cb9e95090 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state)
55 55
56 pm_prepare_console(); 56 pm_prepare_console();
57 57
58 disable_nonboot_cpus();
59
60 if (num_online_cpus() != 1) {
61 error = -EPERM;
62 goto Enable_cpu;
63 }
64
58 if (freeze_processes()) { 65 if (freeze_processes()) {
59 error = -EAGAIN; 66 error = -EAGAIN;
60 goto Thaw; 67 goto Thaw;
@@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state)
75 pm_ops->finish(state); 82 pm_ops->finish(state);
76 Thaw: 83 Thaw:
77 thaw_processes(); 84 thaw_processes();
85 Enable_cpu:
86 enable_nonboot_cpus();
78 pm_restore_console(); 87 pm_restore_console();
79 return error; 88 return error;
80} 89}
@@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state)
113 if (pm_ops && pm_ops->finish) 122 if (pm_ops && pm_ops->finish)
114 pm_ops->finish(state); 123 pm_ops->finish(state);
115 thaw_processes(); 124 thaw_processes();
125 enable_nonboot_cpus();
116 pm_restore_console(); 126 pm_restore_console();
117} 127}
118 128
@@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state)
150 goto Unlock; 160 goto Unlock;
151 } 161 }
152 162
153 /* Suspend is hard to get right on SMP. */
154 if (num_online_cpus() != 1) {
155 error = -EPERM;
156 goto Unlock;
157 }
158
159 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 163 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
160 if ((error = suspend_prepare(state))) 164 if ((error = suspend_prepare(state)))
161 goto Unlock; 165 goto Unlock;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78d92dc6a1ed..0a086640bcfc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p)
32} 32}
33 33
34/* Refrigerator is place where frozen processes are stored :-). */ 34/* Refrigerator is place where frozen processes are stored :-). */
35void refrigerator(unsigned long flag) 35void refrigerator(void)
36{ 36{
37 /* Hmm, should we be allowed to suspend when there are realtime 37 /* Hmm, should we be allowed to suspend when there are realtime
38 processes around? */ 38 processes around? */
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag)
41 current->state = TASK_UNINTERRUPTIBLE; 41 current->state = TASK_UNINTERRUPTIBLE;
42 pr_debug("%s entered refrigerator\n", current->comm); 42 pr_debug("%s entered refrigerator\n", current->comm);
43 printk("="); 43 printk("=");
44 current->flags &= ~PF_FREEZE;
45 44
45 frozen_process(current);
46 spin_lock_irq(&current->sighand->siglock); 46 spin_lock_irq(&current->sighand->siglock);
47 recalc_sigpending(); /* We sent fake signal, clean it up */ 47 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock); 48 spin_unlock_irq(&current->sighand->siglock);
49 49
50 current->flags |= PF_FROZEN; 50 while (frozen(current))
51 while (current->flags & PF_FROZEN)
52 schedule(); 51 schedule();
53 pr_debug("%s left refrigerator\n", current->comm); 52 pr_debug("%s left refrigerator\n", current->comm);
54 current->state = save; 53 current->state = save;
@@ -57,10 +56,10 @@ void refrigerator(unsigned long flag)
57/* 0 = success, else # of processes that we failed to stop */ 56/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 57int freeze_processes(void)
59{ 58{
60 int todo; 59 int todo;
61 unsigned long start_time; 60 unsigned long start_time;
62 struct task_struct *g, *p; 61 struct task_struct *g, *p;
63 62
64 printk( "Stopping tasks: " ); 63 printk( "Stopping tasks: " );
65 start_time = jiffies; 64 start_time = jiffies;
66 do { 65 do {
@@ -70,14 +69,12 @@ int freeze_processes(void)
70 unsigned long flags; 69 unsigned long flags;
71 if (!freezeable(p)) 70 if (!freezeable(p))
72 continue; 71 continue;
73 if ((p->flags & PF_FROZEN) || 72 if ((frozen(p)) ||
74 (p->state == TASK_TRACED) || 73 (p->state == TASK_TRACED) ||
75 (p->state == TASK_STOPPED)) 74 (p->state == TASK_STOPPED))
76 continue; 75 continue;
77 76
78 /* FIXME: smp problem here: we may not access other process' flags 77 freeze(p);
79 without locking */
80 p->flags |= PF_FREEZE;
81 spin_lock_irqsave(&p->sighand->siglock, flags); 78 spin_lock_irqsave(&p->sighand->siglock, flags);
82 signal_wake_up(p, 0); 79 signal_wake_up(p, 0);
83 spin_unlock_irqrestore(&p->sighand->siglock, flags); 80 spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -91,7 +88,7 @@ int freeze_processes(void)
91 return todo; 88 return todo;
92 } 89 }
93 } while(todo); 90 } while(todo);
94 91
95 printk( "|\n" ); 92 printk( "|\n" );
96 BUG_ON(in_atomic()); 93 BUG_ON(in_atomic());
97 return 0; 94 return 0;
@@ -106,10 +103,7 @@ void thaw_processes(void)
106 do_each_thread(g, p) { 103 do_each_thread(g, p) {
107 if (!freezeable(p)) 104 if (!freezeable(p))
108 continue; 105 continue;
109 if (p->flags & PF_FROZEN) { 106 if (!thaw_process(p))
110 p->flags &= ~PF_FROZEN;
111 wake_up_process(p);
112 } else
113 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 107 printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
114 } while_each_thread(g, p); 108 } while_each_thread(g, p);
115 109
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 457c2302ed42..bbe23079c62c 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -13,73 +13,52 @@
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/suspend.h> 14#include <linux/suspend.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/cpu.h>
16#include <asm/atomic.h> 17#include <asm/atomic.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18 19
19static atomic_t cpu_counter, freeze; 20/* This is protected by pm_sem semaphore */
20 21static cpumask_t frozen_cpus;
21
22static void smp_pause(void * data)
23{
24 struct saved_context ctxt;
25 __save_processor_state(&ctxt);
26 printk("Sleeping in:\n");
27 dump_stack();
28 atomic_inc(&cpu_counter);
29 while (atomic_read(&freeze)) {
30 /* FIXME: restore takes place at random piece inside this.
31 This should probably be written in assembly, and
32 preserve general-purpose registers, too
33
34 What about stack? We may need to move to new stack here.
35
36 This should better be ran with interrupts disabled.
37 */
38 cpu_relax();
39 barrier();
40 }
41 atomic_dec(&cpu_counter);
42 __restore_processor_state(&ctxt);
43}
44
45static cpumask_t oldmask;
46 22
47void disable_nonboot_cpus(void) 23void disable_nonboot_cpus(void)
48{ 24{
49 oldmask = current->cpus_allowed; 25 int cpu, error;
50 set_cpus_allowed(current, cpumask_of_cpu(0));
51 printk("Freezing CPUs (at %d)", raw_smp_processor_id());
52 current->state = TASK_INTERRUPTIBLE;
53 schedule_timeout(HZ);
54 printk("...");
55 BUG_ON(raw_smp_processor_id() != 0);
56
57 /* FIXME: for this to work, all the CPUs must be running
58 * "idle" thread (or we deadlock). Is that guaranteed? */
59 26
60 atomic_set(&cpu_counter, 0); 27 error = 0;
61 atomic_set(&freeze, 1); 28 cpus_clear(frozen_cpus);
62 smp_call_function(smp_pause, NULL, 0, 0); 29 printk("Freezing cpus ...\n");
63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { 30 for_each_online_cpu(cpu) {
64 cpu_relax(); 31 if (cpu == 0)
65 barrier(); 32 continue;
33 error = cpu_down(cpu);
34 if (!error) {
35 cpu_set(cpu, frozen_cpus);
36 printk("CPU%d is down\n", cpu);
37 continue;
38 }
39 printk("Error taking cpu %d down: %d\n", cpu, error);
66 } 40 }
67 printk("ok\n"); 41 BUG_ON(smp_processor_id() != 0);
42 if (error)
43 panic("cpus not sleeping");
68} 44}
69 45
70void enable_nonboot_cpus(void) 46void enable_nonboot_cpus(void)
71{ 47{
72 printk("Restarting CPUs"); 48 int cpu, error;
73 atomic_set(&freeze, 0);
74 while (atomic_read(&cpu_counter)) {
75 cpu_relax();
76 barrier();
77 }
78 printk("...");
79 set_cpus_allowed(current, oldmask);
80 schedule();
81 printk("ok\n");
82 49
50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = smp_prepare_cpu(cpu);
53 if (!error)
54 error = cpu_up(cpu);
55 if (!error) {
56 printk("CPU%d is up\n", cpu);
57 continue;
58 }
59 printk("Error taking cpu %d up: %d\n", cpu, error);
60 panic("Not enough cpus");
61 }
62 cpus_clear(frozen_cpus);
83} 63}
84 64
85
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 53f9f8720ee4..c285fc5a2320 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -10,12 +10,12 @@
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
11 * 11 *
12 * I'd like to thank the following people for their work: 12 * I'd like to thank the following people for their work:
13 * 13 *
14 * Pavel Machek <pavel@ucw.cz>: 14 * Pavel Machek <pavel@ucw.cz>:
15 * Modifications, defectiveness pointing, being with me at the very beginning, 15 * Modifications, defectiveness pointing, being with me at the very beginning,
16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. 16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17 * 17 *
18 * Steve Doddi <dirk@loth.demon.co.uk>: 18 * Steve Doddi <dirk@loth.demon.co.uk>:
19 * Support the possibility of hardware state restoring. 19 * Support the possibility of hardware state restoring.
20 * 20 *
21 * Raph <grey.havens@earthling.net>: 21 * Raph <grey.havens@earthling.net>:
@@ -84,11 +84,11 @@ extern char resume_file[];
84static unsigned int nr_copy_pages __nosavedata = 0; 84static unsigned int nr_copy_pages __nosavedata = 0;
85 85
86/* Suspend pagedir is allocated before final copy, therefore it 86/* Suspend pagedir is allocated before final copy, therefore it
87 must be freed after resume 87 must be freed after resume
88 88
89 Warning: this is evil. There are actually two pagedirs at time of 89 Warning: this is evil. There are actually two pagedirs at time of
90 resume. One is "pagedir_save", which is empty frame allocated at 90 resume. One is "pagedir_save", which is empty frame allocated at
91 time of suspend, that must be freed. Second is "pagedir_nosave", 91 time of suspend, that must be freed. Second is "pagedir_nosave",
92 allocated at time of resume, that travels through memory not to 92 allocated at time of resume, that travels through memory not to
93 collide with anything. 93 collide with anything.
94 94
@@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev)
132{ 132{
133 int error; 133 int error;
134 134
135 rw_swap_page_sync(READ, 135 rw_swap_page_sync(READ,
136 swp_entry(root_swap, 0), 136 swp_entry(root_swap, 0),
137 virt_to_page((unsigned long)&swsusp_header)); 137 virt_to_page((unsigned long)&swsusp_header));
138 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 138 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
@@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
140 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 140 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
141 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 141 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
142 swsusp_header.swsusp_info = prev; 142 swsusp_header.swsusp_info = prev;
143 error = rw_swap_page_sync(WRITE, 143 error = rw_swap_page_sync(WRITE,
144 swp_entry(root_swap, 0), 144 swp_entry(root_swap, 0),
145 virt_to_page((unsigned long) 145 virt_to_page((unsigned long)
146 &swsusp_header)); 146 &swsusp_header));
@@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
174static int swsusp_swap_check(void) /* This is called before saving image */ 174static int swsusp_swap_check(void) /* This is called before saving image */
175{ 175{
176 int i, len; 176 int i, len;
177 177
178 len=strlen(resume_file); 178 len=strlen(resume_file);
179 root_swap = 0xFFFF; 179 root_swap = 0xFFFF;
180 180
181 swap_list_lock(); 181 swap_list_lock();
182 for(i=0; i<MAX_SWAPFILES; i++) { 182 for (i=0; i<MAX_SWAPFILES; i++) {
183 if (swap_info[i].flags == 0) { 183 if (swap_info[i].flags == 0) {
184 swapfile_used[i]=SWAPFILE_UNUSED; 184 swapfile_used[i]=SWAPFILE_UNUSED;
185 } else { 185 } else {
186 if(!len) { 186 if (!len) {
187 printk(KERN_WARNING "resume= option should be used to set suspend device" ); 187 printk(KERN_WARNING "resume= option should be used to set suspend device" );
188 if(root_swap == 0xFFFF) { 188 if (root_swap == 0xFFFF) {
189 swapfile_used[i] = SWAPFILE_SUSPEND; 189 swapfile_used[i] = SWAPFILE_SUSPEND;
190 root_swap = i; 190 root_swap = i;
191 } else 191 } else
192 swapfile_used[i] = SWAPFILE_IGNORED; 192 swapfile_used[i] = SWAPFILE_IGNORED;
193 } else { 193 } else {
194 /* we ignore all swap devices that are not the resume_file */ 194 /* we ignore all swap devices that are not the resume_file */
195 if (is_resume_device(&swap_info[i])) { 195 if (is_resume_device(&swap_info[i])) {
@@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */
209 * This is called after saving image so modification 209 * This is called after saving image so modification
210 * will be lost after resume... and that's what we want. 210 * will be lost after resume... and that's what we want.
211 * we make the device unusable. A new call to 211 * we make the device unusable. A new call to
212 * lock_swapdevices can unlock the devices. 212 * lock_swapdevices can unlock the devices.
213 */ 213 */
214static void lock_swapdevices(void) 214static void lock_swapdevices(void)
215{ 215{
216 int i; 216 int i;
217 217
218 swap_list_lock(); 218 swap_list_lock();
219 for(i = 0; i< MAX_SWAPFILES; i++) 219 for (i = 0; i< MAX_SWAPFILES; i++)
220 if(swapfile_used[i] == SWAPFILE_IGNORED) { 220 if (swapfile_used[i] == SWAPFILE_IGNORED) {
221 swap_info[i].flags ^= 0xFF; 221 swap_info[i].flags ^= 0xFF;
222 } 222 }
223 swap_list_unlock(); 223 swap_list_unlock();
@@ -229,7 +229,7 @@ static void lock_swapdevices(void)
229 * @loc: Place to store the entry we used. 229 * @loc: Place to store the entry we used.
230 * 230 *
231 * Allocate a new swap entry and 'sync' it. Note we discard -EIO 231 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
232 * errors. That is an artifact left over from swsusp. It did not 232 * errors. That is an artifact left over from swsusp. It did not
233 * check the return of rw_swap_page_sync() at all, since most pages 233 * check the return of rw_swap_page_sync() at all, since most pages
234 * written back to swap would return -EIO. 234 * written back to swap would return -EIO.
235 * This is a partial improvement, since we will at least return other 235 * This is a partial improvement, since we will at least return other
@@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
241 int error = 0; 241 int error = 0;
242 242
243 entry = get_swap_page(); 243 entry = get_swap_page();
244 if (swp_offset(entry) && 244 if (swp_offset(entry) &&
245 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { 245 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
246 error = rw_swap_page_sync(WRITE, entry, 246 error = rw_swap_page_sync(WRITE, entry,
247 virt_to_page(addr)); 247 virt_to_page(addr));
@@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
257/** 257/**
258 * data_free - Free the swap entries used by the saved image. 258 * data_free - Free the swap entries used by the saved image.
259 * 259 *
260 * Walk the list of used swap entries and free each one. 260 * Walk the list of used swap entries and free each one.
261 * This is only used for cleanup when suspend fails. 261 * This is only used for cleanup when suspend fails.
262 */ 262 */
263static void data_free(void) 263static void data_free(void)
@@ -290,7 +290,7 @@ static int data_write(void)
290 mod = 1; 290 mod = 1;
291 291
292 printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); 292 printk( "Writing data to swap (%d pages)... ", nr_copy_pages );
293 for_each_pbe(p, pagedir_nosave) { 293 for_each_pbe (p, pagedir_nosave) {
294 if (!(i%mod)) 294 if (!(i%mod))
295 printk( "\b\b\b\b%3d%%", i / mod ); 295 printk( "\b\b\b\b%3d%%", i / mod );
296 if ((error = write_page(p->address, &(p->swap_address)))) 296 if ((error = write_page(p->address, &(p->swap_address))))
@@ -335,7 +335,7 @@ static int close_swap(void)
335 335
336 dump_info(); 336 dump_info();
337 error = write_page((unsigned long)&swsusp_info, &entry); 337 error = write_page((unsigned long)&swsusp_info, &entry);
338 if (!error) { 338 if (!error) {
339 printk( "S" ); 339 printk( "S" );
340 error = mark_swapfiles(entry); 340 error = mark_swapfiles(entry);
341 printk( "|\n" ); 341 printk( "|\n" );
@@ -370,7 +370,7 @@ static int write_pagedir(void)
370 struct pbe * pbe; 370 struct pbe * pbe;
371 371
372 printk( "Writing pagedir..."); 372 printk( "Writing pagedir...");
373 for_each_pb_page(pbe, pagedir_nosave) { 373 for_each_pb_page (pbe, pagedir_nosave) {
374 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) 374 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
375 return error; 375 return error;
376 } 376 }
@@ -472,7 +472,7 @@ static int save_highmem(void)
472 int res = 0; 472 int res = 0;
473 473
474 pr_debug("swsusp: Saving Highmem\n"); 474 pr_debug("swsusp: Saving Highmem\n");
475 for_each_zone(zone) { 475 for_each_zone (zone) {
476 if (is_highmem(zone)) 476 if (is_highmem(zone))
477 res = save_highmem_zone(zone); 477 res = save_highmem_zone(zone);
478 if (res) 478 if (res)
@@ -547,7 +547,7 @@ static void count_data_pages(void)
547 547
548 nr_copy_pages = 0; 548 nr_copy_pages = 0;
549 549
550 for_each_zone(zone) { 550 for_each_zone (zone) {
551 if (is_highmem(zone)) 551 if (is_highmem(zone))
552 continue; 552 continue;
553 mark_free_pages(zone); 553 mark_free_pages(zone);
@@ -562,9 +562,9 @@ static void copy_data_pages(void)
562 struct zone *zone; 562 struct zone *zone;
563 unsigned long zone_pfn; 563 unsigned long zone_pfn;
564 struct pbe * pbe = pagedir_nosave; 564 struct pbe * pbe = pagedir_nosave;
565 565
566 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); 566 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
567 for_each_zone(zone) { 567 for_each_zone (zone) {
568 if (is_highmem(zone)) 568 if (is_highmem(zone))
569 continue; 569 continue;
570 mark_free_pages(zone); 570 mark_free_pages(zone);
@@ -702,7 +702,7 @@ static void free_image_pages(void)
702{ 702{
703 struct pbe * p; 703 struct pbe * p;
704 704
705 for_each_pbe(p, pagedir_save) { 705 for_each_pbe (p, pagedir_save) {
706 if (p->address) { 706 if (p->address) {
707 ClearPageNosave(virt_to_page(p->address)); 707 ClearPageNosave(virt_to_page(p->address));
708 free_page(p->address); 708 free_page(p->address);
@@ -719,7 +719,7 @@ static int alloc_image_pages(void)
719{ 719{
720 struct pbe * p; 720 struct pbe * p;
721 721
722 for_each_pbe(p, pagedir_save) { 722 for_each_pbe (p, pagedir_save) {
723 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); 723 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
724 if (!p->address) 724 if (!p->address)
725 return -ENOMEM; 725 return -ENOMEM;
@@ -740,7 +740,7 @@ void swsusp_free(void)
740/** 740/**
741 * enough_free_mem - Make sure we enough free memory to snapshot. 741 * enough_free_mem - Make sure we enough free memory to snapshot.
742 * 742 *
743 * Returns TRUE or FALSE after checking the number of available 743 * Returns TRUE or FALSE after checking the number of available
744 * free pages. 744 * free pages.
745 */ 745 */
746 746
@@ -758,11 +758,11 @@ static int enough_free_mem(void)
758/** 758/**
759 * enough_swap - Make sure we have enough swap to save the image. 759 * enough_swap - Make sure we have enough swap to save the image.
760 * 760 *
761 * Returns TRUE or FALSE after checking the total amount of swap 761 * Returns TRUE or FALSE after checking the total amount of swap
762 * space avaiable. 762 * space avaiable.
763 * 763 *
764 * FIXME: si_swapinfo(&i) returns all swap devices information. 764 * FIXME: si_swapinfo(&i) returns all swap devices information.
765 * We should only consider resume_device. 765 * We should only consider resume_device.
766 */ 766 */
767 767
768static int enough_swap(void) 768static int enough_swap(void)
@@ -781,18 +781,18 @@ static int swsusp_alloc(void)
781{ 781{
782 int error; 782 int error;
783 783
784 pagedir_nosave = NULL;
785 nr_copy_pages = calc_nr(nr_copy_pages);
786
784 pr_debug("suspend: (pages needed: %d + %d free: %d)\n", 787 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
785 nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); 788 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
786 789
787 pagedir_nosave = NULL;
788 if (!enough_free_mem()) 790 if (!enough_free_mem())
789 return -ENOMEM; 791 return -ENOMEM;
790 792
791 if (!enough_swap()) 793 if (!enough_swap())
792 return -ENOSPC; 794 return -ENOSPC;
793 795
794 nr_copy_pages = calc_nr(nr_copy_pages);
795
796 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { 796 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
797 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 797 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
798 return -ENOMEM; 798 return -ENOMEM;
@@ -827,8 +827,8 @@ static int suspend_prepare_image(void)
827 error = swsusp_alloc(); 827 error = swsusp_alloc();
828 if (error) 828 if (error)
829 return error; 829 return error;
830 830
831 /* During allocating of suspend pagedir, new cold pages may appear. 831 /* During allocating of suspend pagedir, new cold pages may appear.
832 * Kill them. 832 * Kill them.
833 */ 833 */
834 drain_local_pages(); 834 drain_local_pages();
@@ -929,21 +929,6 @@ int swsusp_resume(void)
929 return error; 929 return error;
930} 930}
931 931
932/* More restore stuff */
933
934/*
935 * Returns true if given address/order collides with any orig_address
936 */
937static int does_collide_order(unsigned long addr, int order)
938{
939 int i;
940
941 for (i=0; i < (1<<order); i++)
942 if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
943 return 1;
944 return 0;
945}
946
947/** 932/**
948 * On resume, for storing the PBE list and the image, 933 * On resume, for storing the PBE list and the image,
949 * we can only use memory pages that do not conflict with the pages 934 * we can only use memory pages that do not conflict with the pages
@@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
973 unsigned long m; 958 unsigned long m;
974 959
975 m = get_zeroed_page(gfp_mask); 960 m = get_zeroed_page(gfp_mask);
976 while (does_collide_order(m, 0)) { 961 while (!PageNosaveFree(virt_to_page(m))) {
977 eat_page((void *)m); 962 eat_page((void *)m);
978 m = get_zeroed_page(gfp_mask); 963 m = get_zeroed_page(gfp_mask);
979 if (!m) 964 if (!m)
@@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1045 1030
1046 /* Set page flags */ 1031 /* Set page flags */
1047 1032
1048 for_each_zone(zone) { 1033 for_each_zone (zone) {
1049 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 1034 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
1050 SetPageNosaveFree(pfn_to_page(zone_pfn + 1035 SetPageNosaveFree(pfn_to_page(zone_pfn +
1051 zone->zone_start_pfn)); 1036 zone->zone_start_pfn));
@@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1061 /* Relocate colliding pages */ 1046 /* Relocate colliding pages */
1062 1047
1063 for_each_pb_page (pbpage, pblist) { 1048 for_each_pb_page (pbpage, pblist) {
1064 if (does_collide_order((unsigned long)pbpage, 0)) { 1049 if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
1065 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); 1050 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
1066 if (!m) { 1051 if (!m) {
1067 error = -ENOMEM; 1052 error = -ENOMEM;
@@ -1193,8 +1178,10 @@ static const char * sanity_check(void)
1193 return "version"; 1178 return "version";
1194 if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) 1179 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1195 return "machine"; 1180 return "machine";
1181#if 0
1196 if(swsusp_info.cpus != num_online_cpus()) 1182 if(swsusp_info.cpus != num_online_cpus())
1197 return "number of cpus"; 1183 return "number of cpus";
1184#endif
1198 return NULL; 1185 return NULL;
1199} 1186}
1200 1187
diff --git a/kernel/printk.c b/kernel/printk.c
index 3a442bfb8bee..5092397fac29 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
588 log_level_unknown = 1; 588 log_level_unknown = 1;
589 } 589 }
590 590
591 if (!cpu_online(smp_processor_id()) && 591 if (!cpu_online(smp_processor_id())) {
592 system_state != SYSTEM_RUNNING) {
593 /* 592 /*
594 * Some console drivers may assume that per-cpu resources have 593 * Some console drivers may assume that per-cpu resources have
595 * been allocated. So don't allow them to be called by this 594 * been allocated. So don't allow them to be called by this
diff --git a/kernel/resource.c b/kernel/resource.c
index 52f696f11adf..26967e042201 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new,
263 new->start = min; 263 new->start = min;
264 if (new->end > max) 264 if (new->end > max)
265 new->end = max; 265 new->end = max;
266 new->start = (new->start + align - 1) & ~(align - 1); 266 new->start = ALIGN(new->start, align);
267 if (alignf) 267 if (alignf)
268 alignf(alignf_data, new, size, align); 268 alignf(alignf_data, new, size, align);
269 if (new->start < new->end && new->end - new->start >= size - 1) { 269 if (new->start < new->end && new->end - new->start >= size - 1) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 76080d142e3d..a07cff90d849 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
166#define SCALE_PRIO(x, prio) \ 166#define SCALE_PRIO(x, prio) \
167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168 168
169static inline unsigned int task_timeslice(task_t *p) 169static unsigned int task_timeslice(task_t *p)
170{ 170{
171 if (p->static_prio < NICE_TO_PRIO(0)) 171 if (p->static_prio < NICE_TO_PRIO(0))
172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -206,7 +206,7 @@ struct runqueue {
206 */ 206 */
207 unsigned long nr_running; 207 unsigned long nr_running;
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
209 unsigned long cpu_load; 209 unsigned long cpu_load[3];
210#endif 210#endif
211 unsigned long long nr_switches; 211 unsigned long long nr_switches;
212 212
@@ -260,22 +260,86 @@ struct runqueue {
260 260
261static DEFINE_PER_CPU(struct runqueue, runqueues); 261static DEFINE_PER_CPU(struct runqueue, runqueues);
262 262
263/*
264 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
265 * See detach_destroy_domains: synchronize_sched for details.
266 *
267 * The domain tree of any CPU may only be accessed from within
268 * preempt-disabled sections.
269 */
263#define for_each_domain(cpu, domain) \ 270#define for_each_domain(cpu, domain) \
264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) 271for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
265 272
266#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 273#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
267#define this_rq() (&__get_cpu_var(runqueues)) 274#define this_rq() (&__get_cpu_var(runqueues))
268#define task_rq(p) cpu_rq(task_cpu(p)) 275#define task_rq(p) cpu_rq(task_cpu(p))
269#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 276#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
270 277
271/*
272 * Default context-switch locking:
273 */
274#ifndef prepare_arch_switch 278#ifndef prepare_arch_switch
275# define prepare_arch_switch(rq, next) do { } while (0) 279# define prepare_arch_switch(next) do { } while (0)
276# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) 280#endif
277# define task_running(rq, p) ((rq)->curr == (p)) 281#ifndef finish_arch_switch
282# define finish_arch_switch(prev) do { } while (0)
283#endif
284
285#ifndef __ARCH_WANT_UNLOCKED_CTXSW
286static inline int task_running(runqueue_t *rq, task_t *p)
287{
288 return rq->curr == p;
289}
290
291static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
292{
293}
294
295static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
296{
297 spin_unlock_irq(&rq->lock);
298}
299
300#else /* __ARCH_WANT_UNLOCKED_CTXSW */
301static inline int task_running(runqueue_t *rq, task_t *p)
302{
303#ifdef CONFIG_SMP
304 return p->oncpu;
305#else
306 return rq->curr == p;
307#endif
308}
309
310static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
311{
312#ifdef CONFIG_SMP
313 /*
314 * We can optimise this out completely for !SMP, because the
315 * SMP rebalancing from interrupt is the only thing that cares
316 * here.
317 */
318 next->oncpu = 1;
319#endif
320#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
321 spin_unlock_irq(&rq->lock);
322#else
323 spin_unlock(&rq->lock);
324#endif
325}
326
327static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
328{
329#ifdef CONFIG_SMP
330 /*
331 * After ->oncpu is cleared, the task can be moved to a different CPU.
332 * We must ensure this doesn't happen until the switch is completely
333 * finished.
334 */
335 smp_wmb();
336 prev->oncpu = 0;
278#endif 337#endif
338#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
339 local_irq_enable();
340#endif
341}
342#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
279 343
280/* 344/*
281 * task_rq_lock - lock the runqueue a given task resides on and disable 345 * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
309 * bump this up when changing the output format or the meaning of an existing 373 * bump this up when changing the output format or the meaning of an existing
310 * format, so that tools can adapt (or abort) 374 * format, so that tools can adapt (or abort)
311 */ 375 */
312#define SCHEDSTAT_VERSION 11 376#define SCHEDSTAT_VERSION 12
313 377
314static int show_schedstat(struct seq_file *seq, void *v) 378static int show_schedstat(struct seq_file *seq, void *v)
315{ 379{
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
338 402
339#ifdef CONFIG_SMP 403#ifdef CONFIG_SMP
340 /* domain-specific stats */ 404 /* domain-specific stats */
405 preempt_disable();
341 for_each_domain(cpu, sd) { 406 for_each_domain(cpu, sd) {
342 enum idle_type itype; 407 enum idle_type itype;
343 char mask_str[NR_CPUS]; 408 char mask_str[NR_CPUS];
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
356 sd->lb_nobusyq[itype], 421 sd->lb_nobusyq[itype],
357 sd->lb_nobusyg[itype]); 422 sd->lb_nobusyg[itype]);
358 } 423 }
359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", 424 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 425 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361 sd->sbe_pushed, sd->sbe_attempts, 426 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
427 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 428 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
363 } 429 }
430 preempt_enable();
364#endif 431#endif
365 } 432 }
366 return 0; 433 return 0;
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)
414 return rq; 481 return rq;
415} 482}
416 483
417#ifdef CONFIG_SCHED_SMT
418static int cpu_and_siblings_are_idle(int cpu)
419{
420 int sib;
421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
422 if (idle_cpu(sib))
423 continue;
424 return 0;
425 }
426
427 return 1;
428}
429#else
430#define cpu_and_siblings_are_idle(A) idle_cpu(A)
431#endif
432
433#ifdef CONFIG_SCHEDSTATS 484#ifdef CONFIG_SCHEDSTATS
434/* 485/*
435 * Called when a process is dequeued from the active array and given 486 * Called when a process is dequeued from the active array and given
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
622 rq->nr_running++; 673 rq->nr_running++;
623} 674}
624 675
625static void recalc_task_prio(task_t *p, unsigned long long now) 676static int recalc_task_prio(task_t *p, unsigned long long now)
626{ 677{
627 /* Caller must always ensure 'now >= p->timestamp' */ 678 /* Caller must always ensure 'now >= p->timestamp' */
628 unsigned long long __sleep_time = now - p->timestamp; 679 unsigned long long __sleep_time = now - p->timestamp;
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
681 } 732 }
682 } 733 }
683 734
684 p->prio = effective_prio(p); 735 return effective_prio(p);
685} 736}
686 737
687/* 738/*
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
704 } 755 }
705#endif 756#endif
706 757
707 recalc_task_prio(p, now); 758 p->prio = recalc_task_prio(p, now);
708 759
709 /* 760 /*
710 * This checks to make sure it's not an uninterruptible task 761 * This checks to make sure it's not an uninterruptible task
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)
782} 833}
783 834
784#ifdef CONFIG_SMP 835#ifdef CONFIG_SMP
785enum request_type {
786 REQ_MOVE_TASK,
787 REQ_SET_DOMAIN,
788};
789
790typedef struct { 836typedef struct {
791 struct list_head list; 837 struct list_head list;
792 enum request_type type;
793 838
794 /* For REQ_MOVE_TASK */
795 task_t *task; 839 task_t *task;
796 int dest_cpu; 840 int dest_cpu;
797 841
798 /* For REQ_SET_DOMAIN */
799 struct sched_domain *sd;
800
801 struct completion done; 842 struct completion done;
802} migration_req_t; 843} migration_req_t;
803 844
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
819 } 860 }
820 861
821 init_completion(&req->done); 862 init_completion(&req->done);
822 req->type = REQ_MOVE_TASK;
823 req->task = p; 863 req->task = p;
824 req->dest_cpu = dest_cpu; 864 req->dest_cpu = dest_cpu;
825 list_add(&req->list, &rq->migration_queue); 865 list_add(&req->list, &rq->migration_queue);
@@ -886,26 +926,154 @@ void kick_process(task_t *p)
886 * We want to under-estimate the load of migration sources, to 926 * We want to under-estimate the load of migration sources, to
887 * balance conservatively. 927 * balance conservatively.
888 */ 928 */
889static inline unsigned long source_load(int cpu) 929static inline unsigned long source_load(int cpu, int type)
890{ 930{
891 runqueue_t *rq = cpu_rq(cpu); 931 runqueue_t *rq = cpu_rq(cpu);
892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 932 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
933 if (type == 0)
934 return load_now;
893 935
894 return min(rq->cpu_load, load_now); 936 return min(rq->cpu_load[type-1], load_now);
895} 937}
896 938
897/* 939/*
898 * Return a high guess at the load of a migration-target cpu 940 * Return a high guess at the load of a migration-target cpu
899 */ 941 */
900static inline unsigned long target_load(int cpu) 942static inline unsigned long target_load(int cpu, int type)
901{ 943{
902 runqueue_t *rq = cpu_rq(cpu); 944 runqueue_t *rq = cpu_rq(cpu);
903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 945 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
946 if (type == 0)
947 return load_now;
904 948
905 return max(rq->cpu_load, load_now); 949 return max(rq->cpu_load[type-1], load_now);
906} 950}
907 951
908#endif 952/*
953 * find_idlest_group finds and returns the least busy CPU group within the
954 * domain.
955 */
956static struct sched_group *
957find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
958{
959 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
960 unsigned long min_load = ULONG_MAX, this_load = 0;
961 int load_idx = sd->forkexec_idx;
962 int imbalance = 100 + (sd->imbalance_pct-100)/2;
963
964 do {
965 unsigned long load, avg_load;
966 int local_group;
967 int i;
968
969 local_group = cpu_isset(this_cpu, group->cpumask);
970 /* XXX: put a cpus allowed check */
971
972 /* Tally up the load of all CPUs in the group */
973 avg_load = 0;
974
975 for_each_cpu_mask(i, group->cpumask) {
976 /* Bias balancing toward cpus of our domain */
977 if (local_group)
978 load = source_load(i, load_idx);
979 else
980 load = target_load(i, load_idx);
981
982 avg_load += load;
983 }
984
985 /* Adjust by relative CPU power of the group */
986 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
987
988 if (local_group) {
989 this_load = avg_load;
990 this = group;
991 } else if (avg_load < min_load) {
992 min_load = avg_load;
993 idlest = group;
994 }
995 group = group->next;
996 } while (group != sd->groups);
997
998 if (!idlest || 100*this_load < imbalance*min_load)
999 return NULL;
1000 return idlest;
1001}
1002
1003/*
1004 * find_idlest_queue - find the idlest runqueue among the cpus in group.
1005 */
1006static int find_idlest_cpu(struct sched_group *group, int this_cpu)
1007{
1008 unsigned long load, min_load = ULONG_MAX;
1009 int idlest = -1;
1010 int i;
1011
1012 for_each_cpu_mask(i, group->cpumask) {
1013 load = source_load(i, 0);
1014
1015 if (load < min_load || (load == min_load && i == this_cpu)) {
1016 min_load = load;
1017 idlest = i;
1018 }
1019 }
1020
1021 return idlest;
1022}
1023
1024/*
1025 * sched_balance_self: balance the current task (running on cpu) in domains
1026 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1027 * SD_BALANCE_EXEC.
1028 *
1029 * Balance, ie. select the least loaded group.
1030 *
1031 * Returns the target CPU number, or the same CPU if no balancing is needed.
1032 *
1033 * preempt must be disabled.
1034 */
1035static int sched_balance_self(int cpu, int flag)
1036{
1037 struct task_struct *t = current;
1038 struct sched_domain *tmp, *sd = NULL;
1039
1040 for_each_domain(cpu, tmp)
1041 if (tmp->flags & flag)
1042 sd = tmp;
1043
1044 while (sd) {
1045 cpumask_t span;
1046 struct sched_group *group;
1047 int new_cpu;
1048 int weight;
1049
1050 span = sd->span;
1051 group = find_idlest_group(sd, t, cpu);
1052 if (!group)
1053 goto nextlevel;
1054
1055 new_cpu = find_idlest_cpu(group, cpu);
1056 if (new_cpu == -1 || new_cpu == cpu)
1057 goto nextlevel;
1058
1059 /* Now try balancing at a lower domain level */
1060 cpu = new_cpu;
1061nextlevel:
1062 sd = NULL;
1063 weight = cpus_weight(span);
1064 for_each_domain(cpu, tmp) {
1065 if (weight <= cpus_weight(tmp->span))
1066 break;
1067 if (tmp->flags & flag)
1068 sd = tmp;
1069 }
1070 /* while loop will break here if sd == NULL */
1071 }
1072
1073 return cpu;
1074}
1075
1076#endif /* CONFIG_SMP */
909 1077
910/* 1078/*
911 * wake_idle() will wake a task on an idle cpu if task->cpu is 1079 * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)
927 1095
928 for_each_domain(cpu, sd) { 1096 for_each_domain(cpu, sd) {
929 if (sd->flags & SD_WAKE_IDLE) { 1097 if (sd->flags & SD_WAKE_IDLE) {
930 cpus_and(tmp, sd->span, cpu_online_map); 1098 cpus_and(tmp, sd->span, p->cpus_allowed);
931 cpus_and(tmp, tmp, p->cpus_allowed);
932 for_each_cpu_mask(i, tmp) { 1099 for_each_cpu_mask(i, tmp) {
933 if (idle_cpu(i)) 1100 if (idle_cpu(i))
934 return i; 1101 return i;
935 } 1102 }
936 } 1103 }
937 else break; 1104 else
1105 break;
938 } 1106 }
939 return cpu; 1107 return cpu;
940} 1108}
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
967 runqueue_t *rq; 1135 runqueue_t *rq;
968#ifdef CONFIG_SMP 1136#ifdef CONFIG_SMP
969 unsigned long load, this_load; 1137 unsigned long load, this_load;
970 struct sched_domain *sd; 1138 struct sched_domain *sd, *this_sd = NULL;
971 int new_cpu; 1139 int new_cpu;
972#endif 1140#endif
973 1141
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
986 if (unlikely(task_running(rq, p))) 1154 if (unlikely(task_running(rq, p)))
987 goto out_activate; 1155 goto out_activate;
988 1156
989#ifdef CONFIG_SCHEDSTATS 1157 new_cpu = cpu;
1158
990 schedstat_inc(rq, ttwu_cnt); 1159 schedstat_inc(rq, ttwu_cnt);
991 if (cpu == this_cpu) { 1160 if (cpu == this_cpu) {
992 schedstat_inc(rq, ttwu_local); 1161 schedstat_inc(rq, ttwu_local);
993 } else { 1162 goto out_set_cpu;
994 for_each_domain(this_cpu, sd) { 1163 }
995 if (cpu_isset(cpu, sd->span)) { 1164
996 schedstat_inc(sd, ttwu_wake_remote); 1165 for_each_domain(this_cpu, sd) {
997 break; 1166 if (cpu_isset(cpu, sd->span)) {
998 } 1167 schedstat_inc(sd, ttwu_wake_remote);
1168 this_sd = sd;
1169 break;
999 } 1170 }
1000 } 1171 }
1001#endif
1002 1172
1003 new_cpu = cpu; 1173 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu; 1174 goto out_set_cpu;
1006 1175
1007 load = source_load(cpu);
1008 this_load = target_load(this_cpu);
1009
1010 /* 1176 /*
1011 * If sync wakeup then subtract the (maximum possible) effect of 1177 * Check for affine wakeup and passive balancing possibilities.
1012 * the currently running task from the load of the current CPU:
1013 */ 1178 */
1014 if (sync) 1179 if (this_sd) {
1015 this_load -= SCHED_LOAD_SCALE; 1180 int idx = this_sd->wake_idx;
1181 unsigned int imbalance;
1016 1182
1017 /* Don't pull the task off an idle CPU to a busy one */ 1183 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1019 goto out_set_cpu;
1020 1184
1021 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1185 load = source_load(cpu, idx);
1186 this_load = target_load(this_cpu, idx);
1022 1187
1023 /* 1188 new_cpu = this_cpu; /* Wake to this CPU if we can */
1024 * Scan domains for affine wakeup and passive balancing
1025 * possibilities.
1026 */
1027 for_each_domain(this_cpu, sd) {
1028 unsigned int imbalance;
1029 /*
1030 * Start passive balancing when half the imbalance_pct
1031 * limit is reached.
1032 */
1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1034 1189
1035 if ((sd->flags & SD_WAKE_AFFINE) && 1190 if (this_sd->flags & SD_WAKE_AFFINE) {
1036 !task_hot(p, rq->timestamp_last_tick, sd)) { 1191 unsigned long tl = this_load;
1037 /* 1192 /*
1038 * This domain has SD_WAKE_AFFINE and p is cache cold 1193 * If sync wakeup then subtract the (maximum possible)
1039 * in this domain. 1194 * effect of the currently running task from the load
1195 * of the current CPU:
1040 */ 1196 */
1041 if (cpu_isset(cpu, sd->span)) { 1197 if (sync)
1042 schedstat_inc(sd, ttwu_move_affine); 1198 tl -= SCHED_LOAD_SCALE;
1199
1200 if ((tl <= load &&
1201 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
1202 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
1203 /*
1204 * This domain has SD_WAKE_AFFINE and
1205 * p is cache cold in this domain, and
1206 * there is no bad imbalance.
1207 */
1208 schedstat_inc(this_sd, ttwu_move_affine);
1043 goto out_set_cpu; 1209 goto out_set_cpu;
1044 } 1210 }
1045 } else if ((sd->flags & SD_WAKE_BALANCE) && 1211 }
1046 imbalance*this_load <= 100*load) { 1212
1047 /* 1213 /*
1048 * This domain has SD_WAKE_BALANCE and there is 1214 * Start passive balancing when half the imbalance_pct
1049 * an imbalance. 1215 * limit is reached.
1050 */ 1216 */
1051 if (cpu_isset(cpu, sd->span)) { 1217 if (this_sd->flags & SD_WAKE_BALANCE) {
1052 schedstat_inc(sd, ttwu_move_balance); 1218 if (imbalance*this_load <= 100*load) {
1219 schedstat_inc(this_sd, ttwu_move_balance);
1053 goto out_set_cpu; 1220 goto out_set_cpu;
1054 } 1221 }
1055 } 1222 }
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
1120 return try_to_wake_up(p, state, 0); 1287 return try_to_wake_up(p, state, 0);
1121} 1288}
1122 1289
1123#ifdef CONFIG_SMP
1124static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1125 struct sched_domain *sd);
1126#endif
1127
1128/* 1290/*
1129 * Perform scheduler related setup for a newly forked process p. 1291 * Perform scheduler related setup for a newly forked process p.
1130 * p is forked by current. 1292 * p is forked by current.
1131 */ 1293 */
1132void fastcall sched_fork(task_t *p) 1294void fastcall sched_fork(task_t *p, int clone_flags)
1133{ 1295{
1296 int cpu = get_cpu();
1297
1298#ifdef CONFIG_SMP
1299 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1300#endif
1301 set_task_cpu(p, cpu);
1302
1134 /* 1303 /*
1135 * We mark the process as running here, but have not actually 1304 * We mark the process as running here, but have not actually
1136 * inserted it onto the runqueue yet. This guarantees that 1305 * inserted it onto the runqueue yet. This guarantees that
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
1140 p->state = TASK_RUNNING; 1309 p->state = TASK_RUNNING;
1141 INIT_LIST_HEAD(&p->run_list); 1310 INIT_LIST_HEAD(&p->run_list);
1142 p->array = NULL; 1311 p->array = NULL;
1143 spin_lock_init(&p->switch_lock);
1144#ifdef CONFIG_SCHEDSTATS 1312#ifdef CONFIG_SCHEDSTATS
1145 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1313 memset(&p->sched_info, 0, sizeof(p->sched_info));
1146#endif 1314#endif
1315#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1316 p->oncpu = 0;
1317#endif
1147#ifdef CONFIG_PREEMPT 1318#ifdef CONFIG_PREEMPT
1148 /* 1319 /* Want to start with kernel preemption disabled. */
1149 * During context-switch we hold precisely one spinlock, which
1150 * schedule_tail drops. (in the common case it's this_rq()->lock,
1151 * but it also can be p->switch_lock.) So we compensate with a count
1152 * of 1. Also, we want to start with kernel preemption disabled.
1153 */
1154 p->thread_info->preempt_count = 1; 1320 p->thread_info->preempt_count = 1;
1155#endif 1321#endif
1156 /* 1322 /*
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
1174 * runqueue lock is not a problem. 1340 * runqueue lock is not a problem.
1175 */ 1341 */
1176 current->time_slice = 1; 1342 current->time_slice = 1;
1177 preempt_disable();
1178 scheduler_tick(); 1343 scheduler_tick();
1179 local_irq_enable(); 1344 }
1180 preempt_enable(); 1345 local_irq_enable();
1181 } else 1346 put_cpu();
1182 local_irq_enable();
1183} 1347}
1184 1348
1185/* 1349/*
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1196 runqueue_t *rq, *this_rq; 1360 runqueue_t *rq, *this_rq;
1197 1361
1198 rq = task_rq_lock(p, &flags); 1362 rq = task_rq_lock(p, &flags);
1199 cpu = task_cpu(p);
1200 this_cpu = smp_processor_id();
1201
1202 BUG_ON(p->state != TASK_RUNNING); 1363 BUG_ON(p->state != TASK_RUNNING);
1364 this_cpu = smp_processor_id();
1365 cpu = task_cpu(p);
1203 1366
1204 /* 1367 /*
1205 * We decrease the sleep average of forking parents 1368 * We decrease the sleep average of forking parents
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)
1296} 1459}
1297 1460
1298/** 1461/**
1462 * prepare_task_switch - prepare to switch tasks
1463 * @rq: the runqueue preparing to switch
1464 * @next: the task we are going to switch to.
1465 *
1466 * This is called with the rq lock held and interrupts off. It must
1467 * be paired with a subsequent finish_task_switch after the context
1468 * switch.
1469 *
1470 * prepare_task_switch sets up locking and calls architecture specific
1471 * hooks.
1472 */
1473static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1474{
1475 prepare_lock_switch(rq, next);
1476 prepare_arch_switch(next);
1477}
1478
1479/**
1299 * finish_task_switch - clean up after a task-switch 1480 * finish_task_switch - clean up after a task-switch
1300 * @prev: the thread we just switched away from. 1481 * @prev: the thread we just switched away from.
1301 * 1482 *
1302 * We enter this with the runqueue still locked, and finish_arch_switch() 1483 * finish_task_switch must be called after the context switch, paired
1303 * will unlock it along with doing any other architecture-specific cleanup 1484 * with a prepare_task_switch call before the context switch.
1304 * actions. 1485 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1486 * and do any other architecture-specific cleanup actions.
1305 * 1487 *
1306 * Note that we may have delayed dropping an mm in context_switch(). If 1488 * Note that we may have delayed dropping an mm in context_switch(). If
1307 * so, we finish that here outside of the runqueue lock. (Doing it 1489 * so, we finish that here outside of the runqueue lock. (Doing it
1308 * with the lock held can cause deadlocks; see schedule() for 1490 * with the lock held can cause deadlocks; see schedule() for
1309 * details.) 1491 * details.)
1310 */ 1492 */
1311static inline void finish_task_switch(task_t *prev) 1493static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1312 __releases(rq->lock) 1494 __releases(rq->lock)
1313{ 1495{
1314 runqueue_t *rq = this_rq();
1315 struct mm_struct *mm = rq->prev_mm; 1496 struct mm_struct *mm = rq->prev_mm;
1316 unsigned long prev_task_flags; 1497 unsigned long prev_task_flags;
1317 1498
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
1329 * Manfred Spraul <manfred@colorfullife.com> 1510 * Manfred Spraul <manfred@colorfullife.com>
1330 */ 1511 */
1331 prev_task_flags = prev->flags; 1512 prev_task_flags = prev->flags;
1332 finish_arch_switch(rq, prev); 1513 finish_arch_switch(prev);
1514 finish_lock_switch(rq, prev);
1333 if (mm) 1515 if (mm)
1334 mmdrop(mm); 1516 mmdrop(mm);
1335 if (unlikely(prev_task_flags & PF_DEAD)) 1517 if (unlikely(prev_task_flags & PF_DEAD))
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
1343asmlinkage void schedule_tail(task_t *prev) 1525asmlinkage void schedule_tail(task_t *prev)
1344 __releases(rq->lock) 1526 __releases(rq->lock)
1345{ 1527{
1346 finish_task_switch(prev); 1528 runqueue_t *rq = this_rq();
1347 1529 finish_task_switch(rq, prev);
1530#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1531 /* In this case, finish_task_switch does not reenable preemption */
1532 preempt_enable();
1533#endif
1348 if (current->set_child_tid) 1534 if (current->set_child_tid)
1349 put_user(current->pid, current->set_child_tid); 1535 put_user(current->pid, current->set_child_tid);
1350} 1536}
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1494} 1680}
1495 1681
1496/* 1682/*
1497 * find_idlest_cpu - find the least busy runqueue.
1498 */
1499static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1500 struct sched_domain *sd)
1501{
1502 unsigned long load, min_load, this_load;
1503 int i, min_cpu;
1504 cpumask_t mask;
1505
1506 min_cpu = UINT_MAX;
1507 min_load = ULONG_MAX;
1508
1509 cpus_and(mask, sd->span, p->cpus_allowed);
1510
1511 for_each_cpu_mask(i, mask) {
1512 load = target_load(i);
1513
1514 if (load < min_load) {
1515 min_cpu = i;
1516 min_load = load;
1517
1518 /* break out early on an idle CPU: */
1519 if (!min_load)
1520 break;
1521 }
1522 }
1523
1524 /* add +1 to account for the new task */
1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1526
1527 /*
1528 * Would with the addition of the new task to the
1529 * current CPU there be an imbalance between this
1530 * CPU and the idlest CPU?
1531 *
1532 * Use half of the balancing threshold - new-context is
1533 * a good opportunity to balance.
1534 */
1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1536 return min_cpu;
1537
1538 return this_cpu;
1539}
1540
1541/*
1542 * If dest_cpu is allowed for this process, migrate the task to it. 1683 * If dest_cpu is allowed for this process, migrate the task to it.
1543 * This is accomplished by forcing the cpu_allowed mask to only 1684 * This is accomplished by forcing the cpu_allowed mask to only
1544 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1685 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1571,37 +1712,16 @@ out:
1571} 1712}
1572 1713
1573/* 1714/*
1574 * sched_exec(): find the highest-level, exec-balance-capable 1715 * sched_exec - execve() is a valuable balancing opportunity, because at
1575 * domain and try to migrate the task to the least loaded CPU. 1716 * this point the task has the smallest effective memory and cache footprint.
1576 *
1577 * execve() is a valuable balancing opportunity, because at this point
1578 * the task has the smallest effective memory and cache footprint.
1579 */ 1717 */
1580void sched_exec(void) 1718void sched_exec(void)
1581{ 1719{
1582 struct sched_domain *tmp, *sd = NULL;
1583 int new_cpu, this_cpu = get_cpu(); 1720 int new_cpu, this_cpu = get_cpu();
1584 1721 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1585 /* Prefer the current CPU if there's only this task running */
1586 if (this_rq()->nr_running <= 1)
1587 goto out;
1588
1589 for_each_domain(this_cpu, tmp)
1590 if (tmp->flags & SD_BALANCE_EXEC)
1591 sd = tmp;
1592
1593 if (sd) {
1594 schedstat_inc(sd, sbe_attempts);
1595 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1596 if (new_cpu != this_cpu) {
1597 schedstat_inc(sd, sbe_pushed);
1598 put_cpu();
1599 sched_migrate_task(current, new_cpu);
1600 return;
1601 }
1602 }
1603out:
1604 put_cpu(); 1722 put_cpu();
1723 if (new_cpu != this_cpu)
1724 sched_migrate_task(current, new_cpu);
1605} 1725}
1606 1726
1607/* 1727/*
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1632 */ 1752 */
1633static inline 1753static inline
1634int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1754int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1635 struct sched_domain *sd, enum idle_type idle) 1755 struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1636{ 1756{
1637 /* 1757 /*
1638 * We do not migrate tasks that are: 1758 * We do not migrate tasks that are:
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1760 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1641 * 3) are cache-hot on their current CPU. 1761 * 3) are cache-hot on their current CPU.
1642 */ 1762 */
1643 if (task_running(rq, p))
1644 return 0;
1645 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1763 if (!cpu_isset(this_cpu, p->cpus_allowed))
1646 return 0; 1764 return 0;
1765 *all_pinned = 0;
1766
1767 if (task_running(rq, p))
1768 return 0;
1647 1769
1648 /* 1770 /*
1649 * Aggressive migration if: 1771 * Aggressive migration if:
1650 * 1) the [whole] cpu is idle, or 1772 * 1) task is cache cold, or
1651 * 2) too many balance attempts have failed. 1773 * 2) too many balance attempts have failed.
1652 */ 1774 */
1653 1775
1654 if (cpu_and_siblings_are_idle(this_cpu) || \ 1776 if (sd->nr_balance_failed > sd->cache_nice_tries)
1655 sd->nr_balance_failed > sd->cache_nice_tries)
1656 return 1; 1777 return 1;
1657 1778
1658 if (task_hot(p, rq->timestamp_last_tick, sd)) 1779 if (task_hot(p, rq->timestamp_last_tick, sd))
1659 return 0; 1780 return 0;
1660 return 1; 1781 return 1;
1661} 1782}
1662 1783
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1669 */ 1790 */
1670static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1791static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1671 unsigned long max_nr_move, struct sched_domain *sd, 1792 unsigned long max_nr_move, struct sched_domain *sd,
1672 enum idle_type idle) 1793 enum idle_type idle, int *all_pinned)
1673{ 1794{
1674 prio_array_t *array, *dst_array; 1795 prio_array_t *array, *dst_array;
1675 struct list_head *head, *curr; 1796 struct list_head *head, *curr;
1676 int idx, pulled = 0; 1797 int idx, pulled = 0, pinned = 0;
1677 task_t *tmp; 1798 task_t *tmp;
1678 1799
1679 if (max_nr_move <= 0 || busiest->nr_running <= 1) 1800 if (max_nr_move == 0)
1680 goto out; 1801 goto out;
1681 1802
1803 pinned = 1;
1804
1682 /* 1805 /*
1683 * We first consider expired tasks. Those will likely not be 1806 * We first consider expired tasks. Those will likely not be
1684 * executed in the near future, and they are most likely to 1807 * executed in the near future, and they are most likely to
@@ -1717,7 +1840,7 @@ skip_queue:
1717 1840
1718 curr = curr->prev; 1841 curr = curr->prev;
1719 1842
1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { 1843 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1721 if (curr != head) 1844 if (curr != head)
1722 goto skip_queue; 1845 goto skip_queue;
1723 idx++; 1846 idx++;
@@ -1746,6 +1869,9 @@ out:
1746 * inside pull_task(). 1869 * inside pull_task().
1747 */ 1870 */
1748 schedstat_add(sd, lb_gained[idle], pulled); 1871 schedstat_add(sd, lb_gained[idle], pulled);
1872
1873 if (all_pinned)
1874 *all_pinned = pinned;
1749 return pulled; 1875 return pulled;
1750} 1876}
1751 1877
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1760{ 1886{
1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1887 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1888 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1889 int load_idx;
1763 1890
1764 max_load = this_load = total_load = total_pwr = 0; 1891 max_load = this_load = total_load = total_pwr = 0;
1892 if (idle == NOT_IDLE)
1893 load_idx = sd->busy_idx;
1894 else if (idle == NEWLY_IDLE)
1895 load_idx = sd->newidle_idx;
1896 else
1897 load_idx = sd->idle_idx;
1765 1898
1766 do { 1899 do {
1767 unsigned long load; 1900 unsigned long load;
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1776 for_each_cpu_mask(i, group->cpumask) { 1909 for_each_cpu_mask(i, group->cpumask) {
1777 /* Bias balancing toward cpus of our domain */ 1910 /* Bias balancing toward cpus of our domain */
1778 if (local_group) 1911 if (local_group)
1779 load = target_load(i); 1912 load = target_load(i, load_idx);
1780 else 1913 else
1781 load = source_load(i); 1914 load = source_load(i, load_idx);
1782 1915
1783 avg_load += load; 1916 avg_load += load;
1784 } 1917 }
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1792 if (local_group) { 1925 if (local_group) {
1793 this_load = avg_load; 1926 this_load = avg_load;
1794 this = group; 1927 this = group;
1795 goto nextgroup;
1796 } else if (avg_load > max_load) { 1928 } else if (avg_load > max_load) {
1797 max_load = avg_load; 1929 max_load = avg_load;
1798 busiest = group; 1930 busiest = group;
1799 } 1931 }
1800nextgroup:
1801 group = group->next; 1932 group = group->next;
1802 } while (group != sd->groups); 1933 } while (group != sd->groups);
1803 1934
@@ -1870,15 +2001,9 @@ nextgroup:
1870 2001
1871 /* Get rid of the scaling factor, rounding down as we divide */ 2002 /* Get rid of the scaling factor, rounding down as we divide */
1872 *imbalance = *imbalance / SCHED_LOAD_SCALE; 2003 *imbalance = *imbalance / SCHED_LOAD_SCALE;
1873
1874 return busiest; 2004 return busiest;
1875 2005
1876out_balanced: 2006out_balanced:
1877 if (busiest && (idle == NEWLY_IDLE ||
1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1879 *imbalance = 1;
1880 return busiest;
1881 }
1882 2007
1883 *imbalance = 0; 2008 *imbalance = 0;
1884 return NULL; 2009 return NULL;
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
1894 int i; 2019 int i;
1895 2020
1896 for_each_cpu_mask(i, group->cpumask) { 2021 for_each_cpu_mask(i, group->cpumask) {
1897 load = source_load(i); 2022 load = source_load(i, 0);
1898 2023
1899 if (load > max_load) { 2024 if (load > max_load) {
1900 max_load = load; 2025 max_load = load;
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
1906} 2031}
1907 2032
1908/* 2033/*
2034 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2035 * so long as it is large enough.
2036 */
2037#define MAX_PINNED_INTERVAL 512
2038
2039/*
1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2040 * Check this_cpu to ensure it is balanced within domain. Attempt to move
1910 * tasks if there is an imbalance. 2041 * tasks if there is an imbalance.
1911 * 2042 *
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1917 struct sched_group *group; 2048 struct sched_group *group;
1918 runqueue_t *busiest; 2049 runqueue_t *busiest;
1919 unsigned long imbalance; 2050 unsigned long imbalance;
1920 int nr_moved; 2051 int nr_moved, all_pinned = 0;
2052 int active_balance = 0;
1921 2053
1922 spin_lock(&this_rq->lock); 2054 spin_lock(&this_rq->lock);
1923 schedstat_inc(sd, lb_cnt[idle]); 2055 schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1934 goto out_balanced; 2066 goto out_balanced;
1935 } 2067 }
1936 2068
1937 /* 2069 BUG_ON(busiest == this_rq);
1938 * This should be "impossible", but since load
1939 * balancing is inherently racy and statistical,
1940 * it could happen in theory.
1941 */
1942 if (unlikely(busiest == this_rq)) {
1943 WARN_ON(1);
1944 goto out_balanced;
1945 }
1946 2070
1947 schedstat_add(sd, lb_imbalance[idle], imbalance); 2071 schedstat_add(sd, lb_imbalance[idle], imbalance);
1948 2072
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1956 */ 2080 */
1957 double_lock_balance(this_rq, busiest); 2081 double_lock_balance(this_rq, busiest);
1958 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2082 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1959 imbalance, sd, idle); 2083 imbalance, sd, idle,
2084 &all_pinned);
1960 spin_unlock(&busiest->lock); 2085 spin_unlock(&busiest->lock);
2086
2087 /* All tasks on this runqueue were pinned by CPU affinity */
2088 if (unlikely(all_pinned))
2089 goto out_balanced;
1961 } 2090 }
2091
1962 spin_unlock(&this_rq->lock); 2092 spin_unlock(&this_rq->lock);
1963 2093
1964 if (!nr_moved) { 2094 if (!nr_moved) {
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1966 sd->nr_balance_failed++; 2096 sd->nr_balance_failed++;
1967 2097
1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2098 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1969 int wake = 0;
1970 2099
1971 spin_lock(&busiest->lock); 2100 spin_lock(&busiest->lock);
1972 if (!busiest->active_balance) { 2101 if (!busiest->active_balance) {
1973 busiest->active_balance = 1; 2102 busiest->active_balance = 1;
1974 busiest->push_cpu = this_cpu; 2103 busiest->push_cpu = this_cpu;
1975 wake = 1; 2104 active_balance = 1;
1976 } 2105 }
1977 spin_unlock(&busiest->lock); 2106 spin_unlock(&busiest->lock);
1978 if (wake) 2107 if (active_balance)
1979 wake_up_process(busiest->migration_thread); 2108 wake_up_process(busiest->migration_thread);
1980 2109
1981 /* 2110 /*
1982 * We've kicked active balancing, reset the failure 2111 * We've kicked active balancing, reset the failure
1983 * counter. 2112 * counter.
1984 */ 2113 */
1985 sd->nr_balance_failed = sd->cache_nice_tries; 2114 sd->nr_balance_failed = sd->cache_nice_tries+1;
1986 } 2115 }
1987 2116 } else
1988 /*
1989 * We were unbalanced, but unsuccessful in move_tasks(),
1990 * so bump the balance_interval to lessen the lock contention.
1991 */
1992 if (sd->balance_interval < sd->max_interval)
1993 sd->balance_interval++;
1994 } else {
1995 sd->nr_balance_failed = 0; 2117 sd->nr_balance_failed = 0;
1996 2118
2119 if (likely(!active_balance)) {
1997 /* We were unbalanced, so reset the balancing interval */ 2120 /* We were unbalanced, so reset the balancing interval */
1998 sd->balance_interval = sd->min_interval; 2121 sd->balance_interval = sd->min_interval;
2122 } else {
2123 /*
2124 * If we've begun active balancing, start to back off. This
2125 * case may not be covered by the all_pinned logic if there
2126 * is only 1 task on the busy runqueue (because we don't call
2127 * move_tasks).
2128 */
2129 if (sd->balance_interval < sd->max_interval)
2130 sd->balance_interval *= 2;
1999 } 2131 }
2000 2132
2001 return nr_moved; 2133 return nr_moved;
@@ -2005,8 +2137,10 @@ out_balanced:
2005 2137
2006 schedstat_inc(sd, lb_balanced[idle]); 2138 schedstat_inc(sd, lb_balanced[idle]);
2007 2139
2140 sd->nr_balance_failed = 0;
2008 /* tune up the balancing interval */ 2141 /* tune up the balancing interval */
2009 if (sd->balance_interval < sd->max_interval) 2142 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2143 (sd->balance_interval < sd->max_interval))
2010 sd->balance_interval *= 2; 2144 sd->balance_interval *= 2;
2011 2145
2012 return 0; 2146 return 0;
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2164 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2165 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2032 if (!group) { 2166 if (!group) {
2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2167 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2035 goto out; 2168 goto out_balanced;
2036 } 2169 }
2037 2170
2038 busiest = find_busiest_queue(group); 2171 busiest = find_busiest_queue(group);
2039 if (!busiest || busiest == this_rq) { 2172 if (!busiest) {
2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2173 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2042 goto out; 2174 goto out_balanced;
2043 } 2175 }
2044 2176
2177 BUG_ON(busiest == this_rq);
2178
2045 /* Attempt to move tasks */ 2179 /* Attempt to move tasks */
2046 double_lock_balance(this_rq, busiest); 2180 double_lock_balance(this_rq, busiest);
2047 2181
2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2182 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2049 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2183 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2050 imbalance, sd, NEWLY_IDLE); 2184 imbalance, sd, NEWLY_IDLE, NULL);
2051 if (!nr_moved) 2185 if (!nr_moved)
2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2186 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2187 else
2188 sd->nr_balance_failed = 0;
2053 2189
2054 spin_unlock(&busiest->lock); 2190 spin_unlock(&busiest->lock);
2055
2056out:
2057 return nr_moved; 2191 return nr_moved;
2192
2193out_balanced:
2194 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2195 sd->nr_balance_failed = 0;
2196 return 0;
2058} 2197}
2059 2198
2060/* 2199/*
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2086static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2225static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2087{ 2226{
2088 struct sched_domain *sd; 2227 struct sched_domain *sd;
2089 struct sched_group *cpu_group;
2090 runqueue_t *target_rq; 2228 runqueue_t *target_rq;
2091 cpumask_t visited_cpus; 2229 int target_cpu = busiest_rq->push_cpu;
2092 int cpu; 2230
2231 if (busiest_rq->nr_running <= 1)
2232 /* no task to move */
2233 return;
2234
2235 target_rq = cpu_rq(target_cpu);
2093 2236
2094 /* 2237 /*
2095 * Search for suitable CPUs to push tasks to in successively higher 2238 * This condition is "impossible", if it occurs
2096 * domains with SD_LOAD_BALANCE set. 2239 * we need to fix it. Originally reported by
2240 * Bjorn Helgaas on a 128-cpu setup.
2097 */ 2241 */
2098 visited_cpus = CPU_MASK_NONE; 2242 BUG_ON(busiest_rq == target_rq);
2099 for_each_domain(busiest_cpu, sd) {
2100 if (!(sd->flags & SD_LOAD_BALANCE))
2101 /* no more domains to search */
2102 break;
2103 2243
2104 schedstat_inc(sd, alb_cnt); 2244 /* move a task from busiest_rq to target_rq */
2245 double_lock_balance(busiest_rq, target_rq);
2105 2246
2106 cpu_group = sd->groups; 2247 /* Search for an sd spanning us and the target CPU. */
2107 do { 2248 for_each_domain(target_cpu, sd)
2108 for_each_cpu_mask(cpu, cpu_group->cpumask) { 2249 if ((sd->flags & SD_LOAD_BALANCE) &&
2109 if (busiest_rq->nr_running <= 1) 2250 cpu_isset(busiest_cpu, sd->span))
2110 /* no more tasks left to move */ 2251 break;
2111 return; 2252
2112 if (cpu_isset(cpu, visited_cpus)) 2253 if (unlikely(sd == NULL))
2113 continue; 2254 goto out;
2114 cpu_set(cpu, visited_cpus); 2255
2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) 2256 schedstat_inc(sd, alb_cnt);
2116 continue; 2257
2117 2258 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
2118 target_rq = cpu_rq(cpu); 2259 schedstat_inc(sd, alb_pushed);
2119 /* 2260 else
2120 * This condition is "impossible", if it occurs 2261 schedstat_inc(sd, alb_failed);
2121 * we need to fix it. Originally reported by 2262out:
2122 * Bjorn Helgaas on a 128-cpu setup. 2263 spin_unlock(&target_rq->lock);
2123 */
2124 BUG_ON(busiest_rq == target_rq);
2125
2126 /* move a task from busiest_rq to target_rq */
2127 double_lock_balance(busiest_rq, target_rq);
2128 if (move_tasks(target_rq, cpu, busiest_rq,
2129 1, sd, SCHED_IDLE)) {
2130 schedstat_inc(sd, alb_pushed);
2131 } else {
2132 schedstat_inc(sd, alb_failed);
2133 }
2134 spin_unlock(&target_rq->lock);
2135 }
2136 cpu_group = cpu_group->next;
2137 } while (cpu_group != sd->groups);
2138 }
2139} 2264}
2140 2265
2141/* 2266/*
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2156 unsigned long old_load, this_load; 2281 unsigned long old_load, this_load;
2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu); 2282 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2158 struct sched_domain *sd; 2283 struct sched_domain *sd;
2284 int i;
2159 2285
2160 /* Update our load */
2161 old_load = this_rq->cpu_load;
2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2286 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2163 /* 2287 /* Update our load */
2164 * Round up the averaging division if load is increasing. This 2288 for (i = 0; i < 3; i++) {
2165 * prevents us from getting stuck on 9 if the load is 10, for 2289 unsigned long new_load = this_load;
2166 * example. 2290 int scale = 1 << i;
2167 */ 2291 old_load = this_rq->cpu_load[i];
2168 if (this_load > old_load) 2292 /*
2169 old_load++; 2293 * Round up the averaging division if load is increasing. This
2170 this_rq->cpu_load = (old_load + this_load) / 2; 2294 * prevents us from getting stuck on 9 if the load is 10, for
2295 * example.
2296 */
2297 if (new_load > old_load)
2298 new_load += scale-1;
2299 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2300 }
2171 2301
2172 for_each_domain(this_cpu, sd) { 2302 for_each_domain(this_cpu, sd) {
2173 unsigned long interval; 2303 unsigned long interval;
@@ -2447,11 +2577,15 @@ out:
2447#ifdef CONFIG_SCHED_SMT 2577#ifdef CONFIG_SCHED_SMT
2448static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2578static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2449{ 2579{
2450 struct sched_domain *sd = this_rq->sd; 2580 struct sched_domain *tmp, *sd = NULL;
2451 cpumask_t sibling_map; 2581 cpumask_t sibling_map;
2452 int i; 2582 int i;
2453 2583
2454 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2584 for_each_domain(this_cpu, tmp)
2585 if (tmp->flags & SD_SHARE_CPUPOWER)
2586 sd = tmp;
2587
2588 if (!sd)
2455 return; 2589 return;
2456 2590
2457 /* 2591 /*
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2492 2626
2493static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2627static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2494{ 2628{
2495 struct sched_domain *sd = this_rq->sd; 2629 struct sched_domain *tmp, *sd = NULL;
2496 cpumask_t sibling_map; 2630 cpumask_t sibling_map;
2497 prio_array_t *array; 2631 prio_array_t *array;
2498 int ret = 0, i; 2632 int ret = 0, i;
2499 task_t *p; 2633 task_t *p;
2500 2634
2501 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2635 for_each_domain(this_cpu, tmp)
2636 if (tmp->flags & SD_SHARE_CPUPOWER)
2637 sd = tmp;
2638
2639 if (!sd)
2502 return 0; 2640 return 0;
2503 2641
2504 /* 2642 /*
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)
2613 struct list_head *queue; 2751 struct list_head *queue;
2614 unsigned long long now; 2752 unsigned long long now;
2615 unsigned long run_time; 2753 unsigned long run_time;
2616 int cpu, idx; 2754 int cpu, idx, new_prio;
2617 2755
2618 /* 2756 /*
2619 * Test if we are atomic. Since do_exit() needs to call into 2757 * Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2873,14 @@ go_idle:
2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 2873 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2736 2874
2737 array = next->array; 2875 array = next->array;
2738 dequeue_task(next, array); 2876 new_prio = recalc_task_prio(next, next->timestamp + delta);
2739 recalc_task_prio(next, next->timestamp + delta); 2877
2740 enqueue_task(next, array); 2878 if (unlikely(next->prio != new_prio)) {
2879 dequeue_task(next, array);
2880 next->prio = new_prio;
2881 enqueue_task(next, array);
2882 } else
2883 requeue_task(next, array);
2741 } 2884 }
2742 next->activated = 0; 2885 next->activated = 0;
2743switch_tasks: 2886switch_tasks:
@@ -2761,11 +2904,15 @@ switch_tasks:
2761 rq->curr = next; 2904 rq->curr = next;
2762 ++*switch_count; 2905 ++*switch_count;
2763 2906
2764 prepare_arch_switch(rq, next); 2907 prepare_task_switch(rq, next);
2765 prev = context_switch(rq, prev, next); 2908 prev = context_switch(rq, prev, next);
2766 barrier(); 2909 barrier();
2767 2910 /*
2768 finish_task_switch(prev); 2911 * this_rq must be evaluated again because prev may have moved
2912 * CPUs since it called schedule(), thus the 'rq' on its stack
2913 * frame will be invalid.
2914 */
2915 finish_task_switch(this_rq(), prev);
2769 } else 2916 } else
2770 spin_unlock_irq(&rq->lock); 2917 spin_unlock_irq(&rq->lock);
2771 2918
@@ -3384,13 +3531,24 @@ recheck:
3384 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3531 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3385 return -EINVAL; 3532 return -EINVAL;
3386 3533
3387 if ((policy == SCHED_FIFO || policy == SCHED_RR) && 3534 /*
3388 param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && 3535 * Allow unprivileged RT tasks to decrease priority:
3389 !capable(CAP_SYS_NICE)) 3536 */
3390 return -EPERM; 3537 if (!capable(CAP_SYS_NICE)) {
3391 if ((current->euid != p->euid) && (current->euid != p->uid) && 3538 /* can't change policy */
3392 !capable(CAP_SYS_NICE)) 3539 if (policy != p->policy)
3393 return -EPERM; 3540 return -EPERM;
3541 /* can't increase priority */
3542 if (policy != SCHED_NORMAL &&
3543 param->sched_priority > p->rt_priority &&
3544 param->sched_priority >
3545 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3546 return -EPERM;
3547 /* can't change other user's priorities */
3548 if ((current->euid != p->euid) &&
3549 (current->euid != p->uid))
3550 return -EPERM;
3551 }
3394 3552
3395 retval = security_task_setscheduler(p, policy, param); 3553 retval = security_task_setscheduler(p, policy, param);
3396 if (retval) 3554 if (retval)
@@ -4030,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu)
4030 4188
4031 spin_lock_irqsave(&rq->lock, flags); 4189 spin_lock_irqsave(&rq->lock, flags);
4032 rq->curr = rq->idle = idle; 4190 rq->curr = rq->idle = idle;
4191#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4192 idle->oncpu = 1;
4193#endif
4033 set_tsk_need_resched(idle); 4194 set_tsk_need_resched(idle);
4034 spin_unlock_irqrestore(&rq->lock, flags); 4195 spin_unlock_irqrestore(&rq->lock, flags);
4035 4196
@@ -4174,8 +4335,7 @@ static int migration_thread(void * data)
4174 struct list_head *head; 4335 struct list_head *head;
4175 migration_req_t *req; 4336 migration_req_t *req;
4176 4337
4177 if (current->flags & PF_FREEZE) 4338 try_to_freeze();
4178 refrigerator(PF_FREEZE);
4179 4339
4180 spin_lock_irq(&rq->lock); 4340 spin_lock_irq(&rq->lock);
4181 4341
@@ -4200,17 +4360,9 @@ static int migration_thread(void * data)
4200 req = list_entry(head->next, migration_req_t, list); 4360 req = list_entry(head->next, migration_req_t, list);
4201 list_del_init(head->next); 4361 list_del_init(head->next);
4202 4362
4203 if (req->type == REQ_MOVE_TASK) { 4363 spin_unlock(&rq->lock);
4204 spin_unlock(&rq->lock); 4364 __migrate_task(req->task, cpu, req->dest_cpu);
4205 __migrate_task(req->task, cpu, req->dest_cpu); 4365 local_irq_enable();
4206 local_irq_enable();
4207 } else if (req->type == REQ_SET_DOMAIN) {
4208 rq->sd = req->sd;
4209 spin_unlock_irq(&rq->lock);
4210 } else {
4211 spin_unlock_irq(&rq->lock);
4212 WARN_ON(1);
4213 }
4214 4366
4215 complete(&req->done); 4367 complete(&req->done);
4216 } 4368 }
@@ -4441,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4441 migration_req_t *req; 4593 migration_req_t *req;
4442 req = list_entry(rq->migration_queue.next, 4594 req = list_entry(rq->migration_queue.next,
4443 migration_req_t, list); 4595 migration_req_t, list);
4444 BUG_ON(req->type != REQ_MOVE_TASK);
4445 list_del_init(&req->list); 4596 list_del_init(&req->list);
4446 complete(&req->done); 4597 complete(&req->done);
4447 } 4598 }
@@ -4472,12 +4623,17 @@ int __init migration_init(void)
4472#endif 4623#endif
4473 4624
4474#ifdef CONFIG_SMP 4625#ifdef CONFIG_SMP
4475#define SCHED_DOMAIN_DEBUG 4626#undef SCHED_DOMAIN_DEBUG
4476#ifdef SCHED_DOMAIN_DEBUG 4627#ifdef SCHED_DOMAIN_DEBUG
4477static void sched_domain_debug(struct sched_domain *sd, int cpu) 4628static void sched_domain_debug(struct sched_domain *sd, int cpu)
4478{ 4629{
4479 int level = 0; 4630 int level = 0;
4480 4631
4632 if (!sd) {
4633 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4634 return;
4635 }
4636
4481 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4637 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4482 4638
4483 do { 4639 do {
@@ -4560,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
4560#define sched_domain_debug(sd, cpu) {} 4716#define sched_domain_debug(sd, cpu) {}
4561#endif 4717#endif
4562 4718
4719static int sd_degenerate(struct sched_domain *sd)
4720{
4721 if (cpus_weight(sd->span) == 1)
4722 return 1;
4723
4724 /* Following flags need at least 2 groups */
4725 if (sd->flags & (SD_LOAD_BALANCE |
4726 SD_BALANCE_NEWIDLE |
4727 SD_BALANCE_FORK |
4728 SD_BALANCE_EXEC)) {
4729 if (sd->groups != sd->groups->next)
4730 return 0;
4731 }
4732
4733 /* Following flags don't use groups */
4734 if (sd->flags & (SD_WAKE_IDLE |
4735 SD_WAKE_AFFINE |
4736 SD_WAKE_BALANCE))
4737 return 0;
4738
4739 return 1;
4740}
4741
4742static int sd_parent_degenerate(struct sched_domain *sd,
4743 struct sched_domain *parent)
4744{
4745 unsigned long cflags = sd->flags, pflags = parent->flags;
4746
4747 if (sd_degenerate(parent))
4748 return 1;
4749
4750 if (!cpus_equal(sd->span, parent->span))
4751 return 0;
4752
4753 /* Does parent contain flags not in child? */
4754 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
4755 if (cflags & SD_WAKE_AFFINE)
4756 pflags &= ~SD_WAKE_BALANCE;
4757 /* Flags needing groups don't count if only 1 group in parent */
4758 if (parent->groups == parent->groups->next) {
4759 pflags &= ~(SD_LOAD_BALANCE |
4760 SD_BALANCE_NEWIDLE |
4761 SD_BALANCE_FORK |
4762 SD_BALANCE_EXEC);
4763 }
4764 if (~cflags & pflags)
4765 return 0;
4766
4767 return 1;
4768}
4769
4563/* 4770/*
4564 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4771 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4565 * hold the hotplug lock. 4772 * hold the hotplug lock.
4566 */ 4773 */
4567void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) 4774void cpu_attach_domain(struct sched_domain *sd, int cpu)
4568{ 4775{
4569 migration_req_t req;
4570 unsigned long flags;
4571 runqueue_t *rq = cpu_rq(cpu); 4776 runqueue_t *rq = cpu_rq(cpu);
4572 int local = 1; 4777 struct sched_domain *tmp;
4573
4574 sched_domain_debug(sd, cpu);
4575 4778
4576 spin_lock_irqsave(&rq->lock, flags); 4779 /* Remove the sched domains which do not contribute to scheduling. */
4577 4780 for (tmp = sd; tmp; tmp = tmp->parent) {
4578 if (cpu == smp_processor_id() || !cpu_online(cpu)) { 4781 struct sched_domain *parent = tmp->parent;
4579 rq->sd = sd; 4782 if (!parent)
4580 } else { 4783 break;
4581 init_completion(&req.done); 4784 if (sd_parent_degenerate(tmp, parent))
4582 req.type = REQ_SET_DOMAIN; 4785 tmp->parent = parent->parent;
4583 req.sd = sd;
4584 list_add(&req.list, &rq->migration_queue);
4585 local = 0;
4586 } 4786 }
4587 4787
4588 spin_unlock_irqrestore(&rq->lock, flags); 4788 if (sd && sd_degenerate(sd))
4789 sd = sd->parent;
4589 4790
4590 if (!local) { 4791 sched_domain_debug(sd, cpu);
4591 wake_up_process(rq->migration_thread); 4792
4592 wait_for_completion(&req.done); 4793 rcu_assign_pointer(rq->sd, sd);
4593 }
4594} 4794}
4595 4795
4596/* cpus with isolated domains */ 4796/* cpus with isolated domains */
@@ -4622,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
4622 * covered by the given span, and will set each group's ->cpumask correctly, 4822 * covered by the given span, and will set each group's ->cpumask correctly,
4623 * and ->cpu_power to 0. 4823 * and ->cpu_power to 0.
4624 */ 4824 */
4625void __devinit init_sched_build_groups(struct sched_group groups[], 4825void init_sched_build_groups(struct sched_group groups[],
4626 cpumask_t span, int (*group_fn)(int cpu)) 4826 cpumask_t span, int (*group_fn)(int cpu))
4627{ 4827{
4628 struct sched_group *first = NULL, *last = NULL; 4828 struct sched_group *first = NULL, *last = NULL;
@@ -4658,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
4658 4858
4659 4859
4660#ifdef ARCH_HAS_SCHED_DOMAIN 4860#ifdef ARCH_HAS_SCHED_DOMAIN
4661extern void __devinit arch_init_sched_domains(void); 4861extern void build_sched_domains(const cpumask_t *cpu_map);
4662extern void __devinit arch_destroy_sched_domains(void); 4862extern void arch_init_sched_domains(const cpumask_t *cpu_map);
4863extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
4663#else 4864#else
4664#ifdef CONFIG_SCHED_SMT 4865#ifdef CONFIG_SCHED_SMT
4665static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4866static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4666static struct sched_group sched_group_cpus[NR_CPUS]; 4867static struct sched_group sched_group_cpus[NR_CPUS];
4667static int __devinit cpu_to_cpu_group(int cpu) 4868static int cpu_to_cpu_group(int cpu)
4668{ 4869{
4669 return cpu; 4870 return cpu;
4670} 4871}
@@ -4672,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
4672 4873
4673static DEFINE_PER_CPU(struct sched_domain, phys_domains); 4874static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4674static struct sched_group sched_group_phys[NR_CPUS]; 4875static struct sched_group sched_group_phys[NR_CPUS];
4675static int __devinit cpu_to_phys_group(int cpu) 4876static int cpu_to_phys_group(int cpu)
4676{ 4877{
4677#ifdef CONFIG_SCHED_SMT 4878#ifdef CONFIG_SCHED_SMT
4678 return first_cpu(cpu_sibling_map[cpu]); 4879 return first_cpu(cpu_sibling_map[cpu]);
@@ -4685,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu)
4685 4886
4686static DEFINE_PER_CPU(struct sched_domain, node_domains); 4887static DEFINE_PER_CPU(struct sched_domain, node_domains);
4687static struct sched_group sched_group_nodes[MAX_NUMNODES]; 4888static struct sched_group sched_group_nodes[MAX_NUMNODES];
4688static int __devinit cpu_to_node_group(int cpu) 4889static int cpu_to_node_group(int cpu)
4689{ 4890{
4690 return cpu_to_node(cpu); 4891 return cpu_to_node(cpu);
4691} 4892}
@@ -4716,39 +4917,28 @@ static void check_sibling_maps(void)
4716#endif 4917#endif
4717 4918
4718/* 4919/*
4719 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 4920 * Build sched domains for a given set of cpus and attach the sched domains
4921 * to the individual cpus
4720 */ 4922 */
4721static void __devinit arch_init_sched_domains(void) 4923static void build_sched_domains(const cpumask_t *cpu_map)
4722{ 4924{
4723 int i; 4925 int i;
4724 cpumask_t cpu_default_map;
4725
4726#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4727 check_sibling_maps();
4728#endif
4729 /*
4730 * Setup mask for cpus without special case scheduling requirements.
4731 * For now this just excludes isolated cpus, but could be used to
4732 * exclude other special cases in the future.
4733 */
4734 cpus_complement(cpu_default_map, cpu_isolated_map);
4735 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4736 4926
4737 /* 4927 /*
4738 * Set up domains. Isolated domains just stay on the dummy domain. 4928 * Set up domains for cpus specified by the cpu_map.
4739 */ 4929 */
4740 for_each_cpu_mask(i, cpu_default_map) { 4930 for_each_cpu_mask(i, *cpu_map) {
4741 int group; 4931 int group;
4742 struct sched_domain *sd = NULL, *p; 4932 struct sched_domain *sd = NULL, *p;
4743 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 4933 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4744 4934
4745 cpus_and(nodemask, nodemask, cpu_default_map); 4935 cpus_and(nodemask, nodemask, *cpu_map);
4746 4936
4747#ifdef CONFIG_NUMA 4937#ifdef CONFIG_NUMA
4748 sd = &per_cpu(node_domains, i); 4938 sd = &per_cpu(node_domains, i);
4749 group = cpu_to_node_group(i); 4939 group = cpu_to_node_group(i);
4750 *sd = SD_NODE_INIT; 4940 *sd = SD_NODE_INIT;
4751 sd->span = cpu_default_map; 4941 sd->span = *cpu_map;
4752 sd->groups = &sched_group_nodes[group]; 4942 sd->groups = &sched_group_nodes[group];
4753#endif 4943#endif
4754 4944
@@ -4766,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void)
4766 group = cpu_to_cpu_group(i); 4956 group = cpu_to_cpu_group(i);
4767 *sd = SD_SIBLING_INIT; 4957 *sd = SD_SIBLING_INIT;
4768 sd->span = cpu_sibling_map[i]; 4958 sd->span = cpu_sibling_map[i];
4769 cpus_and(sd->span, sd->span, cpu_default_map); 4959 cpus_and(sd->span, sd->span, *cpu_map);
4770 sd->parent = p; 4960 sd->parent = p;
4771 sd->groups = &sched_group_cpus[group]; 4961 sd->groups = &sched_group_cpus[group];
4772#endif 4962#endif
@@ -4776,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void)
4776 /* Set up CPU (sibling) groups */ 4966 /* Set up CPU (sibling) groups */
4777 for_each_online_cpu(i) { 4967 for_each_online_cpu(i) {
4778 cpumask_t this_sibling_map = cpu_sibling_map[i]; 4968 cpumask_t this_sibling_map = cpu_sibling_map[i];
4779 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); 4969 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4780 if (i != first_cpu(this_sibling_map)) 4970 if (i != first_cpu(this_sibling_map))
4781 continue; 4971 continue;
4782 4972
@@ -4789,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void)
4789 for (i = 0; i < MAX_NUMNODES; i++) { 4979 for (i = 0; i < MAX_NUMNODES; i++) {
4790 cpumask_t nodemask = node_to_cpumask(i); 4980 cpumask_t nodemask = node_to_cpumask(i);
4791 4981
4792 cpus_and(nodemask, nodemask, cpu_default_map); 4982 cpus_and(nodemask, nodemask, *cpu_map);
4793 if (cpus_empty(nodemask)) 4983 if (cpus_empty(nodemask))
4794 continue; 4984 continue;
4795 4985
@@ -4799,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void)
4799 4989
4800#ifdef CONFIG_NUMA 4990#ifdef CONFIG_NUMA
4801 /* Set up node groups */ 4991 /* Set up node groups */
4802 init_sched_build_groups(sched_group_nodes, cpu_default_map, 4992 init_sched_build_groups(sched_group_nodes, *cpu_map,
4803 &cpu_to_node_group); 4993 &cpu_to_node_group);
4804#endif 4994#endif
4805 4995
4806 /* Calculate CPU power for physical packages and nodes */ 4996 /* Calculate CPU power for physical packages and nodes */
4807 for_each_cpu_mask(i, cpu_default_map) { 4997 for_each_cpu_mask(i, *cpu_map) {
4808 int power; 4998 int power;
4809 struct sched_domain *sd; 4999 struct sched_domain *sd;
4810#ifdef CONFIG_SCHED_SMT 5000#ifdef CONFIG_SCHED_SMT
@@ -4828,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void)
4828 } 5018 }
4829 5019
4830 /* Attach the domains */ 5020 /* Attach the domains */
4831 for_each_online_cpu(i) { 5021 for_each_cpu_mask(i, *cpu_map) {
4832 struct sched_domain *sd; 5022 struct sched_domain *sd;
4833#ifdef CONFIG_SCHED_SMT 5023#ifdef CONFIG_SCHED_SMT
4834 sd = &per_cpu(cpu_domains, i); 5024 sd = &per_cpu(cpu_domains, i);
@@ -4838,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void)
4838 cpu_attach_domain(sd, i); 5028 cpu_attach_domain(sd, i);
4839 } 5029 }
4840} 5030}
5031/*
5032 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5033 */
5034static void arch_init_sched_domains(cpumask_t *cpu_map)
5035{
5036 cpumask_t cpu_default_map;
5037
5038#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5039 check_sibling_maps();
5040#endif
5041 /*
5042 * Setup mask for cpus without special case scheduling requirements.
5043 * For now this just excludes isolated cpus, but could be used to
5044 * exclude other special cases in the future.
5045 */
5046 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5047
5048 build_sched_domains(&cpu_default_map);
5049}
4841 5050
4842#ifdef CONFIG_HOTPLUG_CPU 5051static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
4843static void __devinit arch_destroy_sched_domains(void)
4844{ 5052{
4845 /* Do nothing: everything is statically allocated. */ 5053 /* Do nothing: everything is statically allocated. */
4846} 5054}
4847#endif
4848 5055
4849#endif /* ARCH_HAS_SCHED_DOMAIN */ 5056#endif /* ARCH_HAS_SCHED_DOMAIN */
4850 5057
4851/* 5058/*
4852 * Initial dummy domain for early boot and for hotplug cpu. Being static, 5059 * Detach sched domains from a group of cpus specified in cpu_map
4853 * it is initialized to zero, so all balancing flags are cleared which is 5060 * These cpus will now be attached to the NULL domain
4854 * what we want.
4855 */ 5061 */
4856static struct sched_domain sched_domain_dummy; 5062static inline void detach_destroy_domains(const cpumask_t *cpu_map)
5063{
5064 int i;
5065
5066 for_each_cpu_mask(i, *cpu_map)
5067 cpu_attach_domain(NULL, i);
5068 synchronize_sched();
5069 arch_destroy_sched_domains(cpu_map);
5070}
5071
5072/*
5073 * Partition sched domains as specified by the cpumasks below.
5074 * This attaches all cpus from the cpumasks to the NULL domain,
5075 * waits for a RCU quiescent period, recalculates sched
5076 * domain information and then attaches them back to the
5077 * correct sched domains
5078 * Call with hotplug lock held
5079 */
5080void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
5081{
5082 cpumask_t change_map;
5083
5084 cpus_and(*partition1, *partition1, cpu_online_map);
5085 cpus_and(*partition2, *partition2, cpu_online_map);
5086 cpus_or(change_map, *partition1, *partition2);
5087
5088 /* Detach sched domains from all of the affected cpus */
5089 detach_destroy_domains(&change_map);
5090 if (!cpus_empty(*partition1))
5091 build_sched_domains(partition1);
5092 if (!cpus_empty(*partition2))
5093 build_sched_domains(partition2);
5094}
4857 5095
4858#ifdef CONFIG_HOTPLUG_CPU 5096#ifdef CONFIG_HOTPLUG_CPU
4859/* 5097/*
4860 * Force a reinitialization of the sched domains hierarchy. The domains 5098 * Force a reinitialization of the sched domains hierarchy. The domains
4861 * and groups cannot be updated in place without racing with the balancing 5099 * and groups cannot be updated in place without racing with the balancing
4862 * code, so we temporarily attach all running cpus to a "dummy" domain 5100 * code, so we temporarily attach all running cpus to the NULL domain
4863 * which will prevent rebalancing while the sched domains are recalculated. 5101 * which will prevent rebalancing while the sched domains are recalculated.
4864 */ 5102 */
4865static int update_sched_domains(struct notifier_block *nfb, 5103static int update_sched_domains(struct notifier_block *nfb,
4866 unsigned long action, void *hcpu) 5104 unsigned long action, void *hcpu)
4867{ 5105{
4868 int i;
4869
4870 switch (action) { 5106 switch (action) {
4871 case CPU_UP_PREPARE: 5107 case CPU_UP_PREPARE:
4872 case CPU_DOWN_PREPARE: 5108 case CPU_DOWN_PREPARE:
4873 for_each_online_cpu(i) 5109 detach_destroy_domains(&cpu_online_map);
4874 cpu_attach_domain(&sched_domain_dummy, i);
4875 arch_destroy_sched_domains();
4876 return NOTIFY_OK; 5110 return NOTIFY_OK;
4877 5111
4878 case CPU_UP_CANCELED: 5112 case CPU_UP_CANCELED:
@@ -4888,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4888 } 5122 }
4889 5123
4890 /* The hotplug lock is already held by cpu_up/cpu_down */ 5124 /* The hotplug lock is already held by cpu_up/cpu_down */
4891 arch_init_sched_domains(); 5125 arch_init_sched_domains(&cpu_online_map);
4892 5126
4893 return NOTIFY_OK; 5127 return NOTIFY_OK;
4894} 5128}
@@ -4897,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4897void __init sched_init_smp(void) 5131void __init sched_init_smp(void)
4898{ 5132{
4899 lock_cpu_hotplug(); 5133 lock_cpu_hotplug();
4900 arch_init_sched_domains(); 5134 arch_init_sched_domains(&cpu_online_map);
4901 unlock_cpu_hotplug(); 5135 unlock_cpu_hotplug();
4902 /* XXX: Theoretical race here - CPU may be hotplugged now */ 5136 /* XXX: Theoretical race here - CPU may be hotplugged now */
4903 hotcpu_notifier(update_sched_domains, 0); 5137 hotcpu_notifier(update_sched_domains, 0);
@@ -4927,13 +5161,15 @@ void __init sched_init(void)
4927 5161
4928 rq = cpu_rq(i); 5162 rq = cpu_rq(i);
4929 spin_lock_init(&rq->lock); 5163 spin_lock_init(&rq->lock);
5164 rq->nr_running = 0;
4930 rq->active = rq->arrays; 5165 rq->active = rq->arrays;
4931 rq->expired = rq->arrays + 1; 5166 rq->expired = rq->arrays + 1;
4932 rq->best_expired_prio = MAX_PRIO; 5167 rq->best_expired_prio = MAX_PRIO;
4933 5168
4934#ifdef CONFIG_SMP 5169#ifdef CONFIG_SMP
4935 rq->sd = &sched_domain_dummy; 5170 rq->sd = NULL;
4936 rq->cpu_load = 0; 5171 for (j = 1; j < 3; j++)
5172 rq->cpu_load[j] = 0;
4937 rq->active_balance = 0; 5173 rq->active_balance = 0;
4938 rq->push_cpu = 0; 5174 rq->push_cpu = 0;
4939 rq->migration_thread = NULL; 5175 rq->migration_thread = NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index d1258729a5f9..ca1186eef938 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -213,7 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
213fastcall void recalc_sigpending_tsk(struct task_struct *t) 213fastcall void recalc_sigpending_tsk(struct task_struct *t)
214{ 214{
215 if (t->signal->group_stop_count > 0 || 215 if (t->signal->group_stop_count > 0 ||
216 (t->flags & PF_FREEZE) || 216 (freezing(t)) ||
217 PENDING(&t->pending, &t->blocked) || 217 PENDING(&t->pending, &t->blocked) ||
218 PENDING(&t->signal->shared_pending, &t->blocked)) 218 PENDING(&t->signal->shared_pending, &t->blocked))
219 set_tsk_thread_flag(t, TIF_SIGPENDING); 219 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -2231,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2231 current->state = TASK_INTERRUPTIBLE; 2231 current->state = TASK_INTERRUPTIBLE;
2232 timeout = schedule_timeout(timeout); 2232 timeout = schedule_timeout(timeout);
2233 2233
2234 if (current->flags & PF_FREEZE) 2234 try_to_freeze();
2235 refrigerator(PF_FREEZE);
2236 spin_lock_irq(&current->sighand->siglock); 2235 spin_lock_irq(&current->sighand->siglock);
2237 sig = dequeue_signal(current, &these, &info); 2236 sig = dequeue_signal(current, &these, &info);
2238 current->blocked = current->real_blocked; 2237 current->blocked = current->real_blocked;
diff --git a/kernel/sys.c b/kernel/sys.c
index da24bc1292db..9a24374c23bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/highuid.h> 17#include <linux/highuid.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/kexec.h>
19#include <linux/workqueue.h> 21#include <linux/workqueue.h>
20#include <linux/device.h> 22#include <linux/device.h>
21#include <linux/key.h> 23#include <linux/key.h>
@@ -405,6 +407,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
405 case LINUX_REBOOT_CMD_HALT: 407 case LINUX_REBOOT_CMD_HALT:
406 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); 408 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
407 system_state = SYSTEM_HALT; 409 system_state = SYSTEM_HALT;
410 device_suspend(PMSG_SUSPEND);
408 device_shutdown(); 411 device_shutdown();
409 printk(KERN_EMERG "System halted.\n"); 412 printk(KERN_EMERG "System halted.\n");
410 machine_halt(); 413 machine_halt();
@@ -415,6 +418,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
415 case LINUX_REBOOT_CMD_POWER_OFF: 418 case LINUX_REBOOT_CMD_POWER_OFF:
416 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); 419 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
417 system_state = SYSTEM_POWER_OFF; 420 system_state = SYSTEM_POWER_OFF;
421 device_suspend(PMSG_SUSPEND);
418 device_shutdown(); 422 device_shutdown();
419 printk(KERN_EMERG "Power down.\n"); 423 printk(KERN_EMERG "Power down.\n");
420 machine_power_off(); 424 machine_power_off();
@@ -431,11 +435,30 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
431 435
432 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); 436 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
433 system_state = SYSTEM_RESTART; 437 system_state = SYSTEM_RESTART;
438 device_suspend(PMSG_FREEZE);
434 device_shutdown(); 439 device_shutdown();
435 printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); 440 printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
436 machine_restart(buffer); 441 machine_restart(buffer);
437 break; 442 break;
438 443
444#ifdef CONFIG_KEXEC
445 case LINUX_REBOOT_CMD_KEXEC:
446 {
447 struct kimage *image;
448 image = xchg(&kexec_image, 0);
449 if (!image) {
450 unlock_kernel();
451 return -EINVAL;
452 }
453 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
454 system_state = SYSTEM_RESTART;
455 device_shutdown();
456 printk(KERN_EMERG "Starting new kernel\n");
457 machine_shutdown();
458 machine_kexec(image);
459 break;
460 }
461#endif
439#ifdef CONFIG_SOFTWARE_SUSPEND 462#ifdef CONFIG_SOFTWARE_SUSPEND
440 case LINUX_REBOOT_CMD_SW_SUSPEND: 463 case LINUX_REBOOT_CMD_SW_SUSPEND:
441 { 464 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6f15bea7d1a8..29196ce9b40f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie); 18cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon); 19cond_syscall(sys_swapon);
20cond_syscall(sys_swapoff); 20cond_syscall(sys_swapoff);
21cond_syscall(sys_kexec_load);
22cond_syscall(compat_sys_kexec_load);
21cond_syscall(sys_init_module); 23cond_syscall(sys_init_module);
22cond_syscall(sys_delete_module); 24cond_syscall(sys_delete_module);
23cond_syscall(sys_socketpair); 25cond_syscall(sys_socketpair);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 24a4d12d5aa9..270ee7fadbd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1000,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1000 int error = parse_table(name, nlen, oldval, oldlenp, 1000 int error = parse_table(name, nlen, oldval, oldlenp,
1001 newval, newlen, head->ctl_table, 1001 newval, newlen, head->ctl_table,
1002 &context); 1002 &context);
1003 if (context) 1003 kfree(context);
1004 kfree(context);
1005 if (error != -ENOTDIR) 1004 if (error != -ENOTDIR)
1006 return error; 1005 return error;
1007 tmp = tmp->next; 1006 tmp = tmp->next;
diff --git a/kernel/timer.c b/kernel/timer.c
index 51ff917c9590..f2a11887a726 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1597,7 +1597,7 @@ void msleep(unsigned int msecs)
1597EXPORT_SYMBOL(msleep); 1597EXPORT_SYMBOL(msleep);
1598 1598
1599/** 1599/**
1600 * msleep_interruptible - sleep waiting for waitqueue interruptions 1600 * msleep_interruptible - sleep waiting for signals
1601 * @msecs: Time in milliseconds to sleep for 1601 * @msecs: Time in milliseconds to sleep for
1602 */ 1602 */
1603unsigned long msleep_interruptible(unsigned int msecs) 1603unsigned long msleep_interruptible(unsigned int msecs)