about summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz       |   46
-rw-r--r--  kernel/Kconfig.preempt  |   65
-rw-r--r--  kernel/Makefile         |    2
-rw-r--r--  kernel/cpu.c            |   14
-rw-r--r--  kernel/cpuset.c         |   97
-rw-r--r--  kernel/crash_dump.c     |   52
-rw-r--r--  kernel/exit.c           |   18
-rw-r--r--  kernel/fork.c           |   33
-rw-r--r--  kernel/irq/autoprobe.c  |    9
-rw-r--r--  kernel/irq/handle.c     |    2
-rw-r--r--  kernel/irq/manage.c     |    8
-rw-r--r--  kernel/irq/spurious.c   |  115
-rw-r--r--  kernel/itimer.c         |    8
-rw-r--r--  kernel/kexec.c          | 1063
-rw-r--r--  kernel/kmod.c           |   17
-rw-r--r--  kernel/kprobes.c        |  360
-rw-r--r--  kernel/ksysfs.c         |   13
-rw-r--r--  kernel/module.c         |   99
-rw-r--r--  kernel/panic.c          |   23
-rw-r--r--  kernel/params.c         |    4
-rw-r--r--  kernel/posix-timers.c   |   34
-rw-r--r--  kernel/power/Kconfig    |    8
-rw-r--r--  kernel/power/Makefile   |    6
-rw-r--r--  kernel/power/disk.c     |   35
-rw-r--r--  kernel/power/main.c     |   16
-rw-r--r--  kernel/power/process.c  |   26
-rw-r--r--  kernel/power/smp.c      |   89
-rw-r--r--  kernel/power/swsusp.c   |   95
-rw-r--r--  kernel/printk.c         |   15
-rw-r--r--  kernel/resource.c       |    2
-rw-r--r--  kernel/sched.c          | 1073
-rw-r--r--  kernel/signal.c         |    4
-rw-r--r--  kernel/stop_machine.c   |    4
-rw-r--r--  kernel/sys.c            |  133
-rw-r--r--  kernel/sys_ni.c         |    3
-rw-r--r--  kernel/sysctl.c         |   12
-rw-r--r--  kernel/timer.c          |  353
37 files changed, 3049 insertions, 907 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 000000000000..248e1c396f8b
--- /dev/null
+++ b/kernel/Kconfig.hz
@@ -0,0 +1,46 @@
1#
2# Timer Interrupt Frequency Configuration
3#
4
5choice
6 prompt "Timer frequency"
7 default HZ_250
8 help
9 Allows the configuration of the timer frequency. It is customary
10 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
11 beneficial for servers and NUMA systems that do not need to have
12 a fast response for user interaction and that may experience bus
13 contention and cacheline bounces as a result of timer interrupts.
14 Note that the timer interrupt occurs on each processor in an SMP
15 environment leading to NR_CPUS * HZ number of timer interrupts
16 per second.
17
18
19 config HZ_100
20 bool "100 HZ"
21 help
22 100 HZ is a typical choice for servers, SMP and NUMA systems
23 with lots of processors that may show reduced performance if
24 too many timer interrupts are occurring.
25
26 config HZ_250
27 bool "250 HZ"
28 help
29 250 HZ is a good compromise choice allowing server performance
30 while also showing good interactive responsiveness even
31 on SMP and NUMA systems.
32
33 config HZ_1000
34 bool "1000 HZ"
35 help
36 1000 HZ is the preferred choice for desktop systems and other
37 systems requiring fast interactive responses to events.
38
39endchoice
40
41config HZ
42 int
43 default 100 if HZ_100
44 default 250 if HZ_250
45 default 1000 if HZ_1000
46
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000000..0b46a5dff4c0
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,65 @@
1
2choice
3 prompt "Preemption Model"
4 default PREEMPT_NONE
5
6config PREEMPT_NONE
7 bool "No Forced Preemption (Server)"
8 help
9 This is the traditional Linux preemption model, geared towards
10 throughput. It will still provide good latencies most of the
11 time, but there are no guarantees and occasional longer delays
12 are possible.
13
14 Select this option if you are building a kernel for a server or
15 scientific/computation system, or if you want to maximize the
16 raw processing power of the kernel, irrespective of scheduling
17 latencies.
18
19config PREEMPT_VOLUNTARY
20 bool "Voluntary Kernel Preemption (Desktop)"
21 help
22 This option reduces the latency of the kernel by adding more
23 "explicit preemption points" to the kernel code. These new
24 preemption points have been selected to reduce the maximum
25 latency of rescheduling, providing faster application reactions,
26 at the cost of slightly lower throughput.
27
28 This allows reaction to interactive events by allowing a
29 low priority process to voluntarily preempt itself even if it
30 is in kernel mode executing a system call. This allows
31 applications to run more 'smoothly' even when the system is
32 under load.
33
34 Select this if you are building a kernel for a desktop system.
35
36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 help
39 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section)
41 preemptible. This allows reaction to interactive events by
42 permitting a low priority process to be preempted involuntarily
43 even if it is in kernel mode executing a system call and would
44 otherwise not be about to reach a natural preemption point.
45 This allows applications to run more 'smoothly' even when the
46 system is under load, at the cost of slightly lower throughput
47 and a slight runtime overhead to kernel code.
48
49 Select this if you are building a kernel for a desktop or
50 embedded system with latency requirements in the milliseconds
51 range.
52
53endchoice
54
55config PREEMPT_BKL
56 bool "Preempt The Big Kernel Lock"
57 depends on SMP || PREEMPT
58 default y
59 help
60 This option reduces the latency of the kernel by making the
61 big kernel lock preemptible.
62
63 Say Y here if you are building a kernel for a desktop system.
64 Say N if you are unsure.
65
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
17obj-$(CONFIG_KALLSYMS) += kallsyms.o 17obj-$(CONFIG_KALLSYMS) += kallsyms.o
18obj-$(CONFIG_PM) += power/ 18obj-$(CONFIG_PM) += power/
19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
20obj-$(CONFIG_KEXEC) += kexec.o
20obj-$(CONFIG_COMPAT) += compat.o 21obj-$(CONFIG_COMPAT) += compat.o
21obj-$(CONFIG_CPUSETS) += cpuset.o 22obj-$(CONFIG_CPUSETS) += cpuset.o
22obj-$(CONFIG_IKCONFIG) += configs.o 23obj-$(CONFIG_IKCONFIG) += configs.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
27obj-$(CONFIG_KPROBES) += kprobes.o 28obj-$(CONFIG_KPROBES) += kprobes.o
28obj-$(CONFIG_SYSFS) += ksysfs.o 29obj-$(CONFIG_SYSFS) += ksysfs.o
29obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
30obj-$(CONFIG_SECCOMP) += seccomp.o 32obj-$(CONFIG_SECCOMP) += seccomp.o
31 33
32ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 34ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 628f4ccda127..53d8263ae12e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused)
63{ 63{
64 int err; 64 int err;
65 65
66 /* Take offline: makes arch_cpu_down somewhat easier. */
67 cpu_clear(smp_processor_id(), cpu_online_map);
68
69 /* Ensure this CPU doesn't handle any more interrupts. */ 66 /* Ensure this CPU doesn't handle any more interrupts. */
70 err = __cpu_disable(); 67 err = __cpu_disable();
71 if (err < 0) 68 if (err < 0)
72 cpu_set(smp_processor_id(), cpu_online_map); 69 return err;
73 else
74 /* Force idle task to run as soon as we yield: it should
75 immediately notice cpu is offline and die quickly. */
76 sched_idle_next();
77 70
78 return err; 71 /* Force idle task to run as soon as we yield: it should
72 immediately notice cpu is offline and die quickly. */
73 sched_idle_next();
74 return 0;
79} 75}
80 76
81int cpu_down(unsigned int cpu) 77int cpu_down(unsigned int cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00e8f2575512..984c0bf3807f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = {
228 228
229static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) 229static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
230{ 230{
231 struct qstr qstr; 231 struct dentry *d = lookup_one_len(name, parent, strlen(name));
232 struct dentry *d;
233
234 qstr.name = name;
235 qstr.len = strlen(name);
236 qstr.hash = full_name_hash(name, qstr.len);
237 d = lookup_hash(&qstr, parent);
238 if (!IS_ERR(d)) 232 if (!IS_ERR(d))
239 d->d_op = &cpuset_dops; 233 d->d_op = &cpuset_dops;
240 return d; 234 return d;
@@ -601,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
601 return 0; 595 return 0;
602} 596}
603 597
598/*
599 * For a given cpuset cur, partition the system as follows
600 * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
601 * exclusive child cpusets
602 * b. All cpus in the current cpuset's cpus_allowed that are not part of any
603 * exclusive child cpusets
604 * Build these two partitions by calling partition_sched_domains
605 *
606 * Call with cpuset_sem held. May nest a call to the
607 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
608 */
609static void update_cpu_domains(struct cpuset *cur)
610{
611 struct cpuset *c, *par = cur->parent;
612 cpumask_t pspan, cspan;
613
614 if (par == NULL || cpus_empty(cur->cpus_allowed))
615 return;
616
617 /*
618 * Get all cpus from parent's cpus_allowed not part of exclusive
619 * children
620 */
621 pspan = par->cpus_allowed;
622 list_for_each_entry(c, &par->children, sibling) {
623 if (is_cpu_exclusive(c))
624 cpus_andnot(pspan, pspan, c->cpus_allowed);
625 }
626 if (is_removed(cur) || !is_cpu_exclusive(cur)) {
627 cpus_or(pspan, pspan, cur->cpus_allowed);
628 if (cpus_equal(pspan, cur->cpus_allowed))
629 return;
630 cspan = CPU_MASK_NONE;
631 } else {
632 if (cpus_empty(pspan))
633 return;
634 cspan = cur->cpus_allowed;
635 /*
636 * Get all cpus from current cpuset's cpus_allowed not part
637 * of exclusive children
638 */
639 list_for_each_entry(c, &cur->children, sibling) {
640 if (is_cpu_exclusive(c))
641 cpus_andnot(cspan, cspan, c->cpus_allowed);
642 }
643 }
644
645 lock_cpu_hotplug();
646 partition_sched_domains(&pspan, &cspan);
647 unlock_cpu_hotplug();
648}
649
604static int update_cpumask(struct cpuset *cs, char *buf) 650static int update_cpumask(struct cpuset *cs, char *buf)
605{ 651{
606 struct cpuset trialcs; 652 struct cpuset trialcs;
607 int retval; 653 int retval, cpus_unchanged;
608 654
609 trialcs = *cs; 655 trialcs = *cs;
610 retval = cpulist_parse(buf, trialcs.cpus_allowed); 656 retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -614,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
614 if (cpus_empty(trialcs.cpus_allowed)) 660 if (cpus_empty(trialcs.cpus_allowed))
615 return -ENOSPC; 661 return -ENOSPC;
616 retval = validate_change(cs, &trialcs); 662 retval = validate_change(cs, &trialcs);
617 if (retval == 0) 663 if (retval < 0)
618 cs->cpus_allowed = trialcs.cpus_allowed; 664 return retval;
619 return retval; 665 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
666 cs->cpus_allowed = trialcs.cpus_allowed;
667 if (is_cpu_exclusive(cs) && !cpus_unchanged)
668 update_cpu_domains(cs);
669 return 0;
620} 670}
621 671
622static int update_nodemask(struct cpuset *cs, char *buf) 672static int update_nodemask(struct cpuset *cs, char *buf)
@@ -652,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
652{ 702{
653 int turning_on; 703 int turning_on;
654 struct cpuset trialcs; 704 struct cpuset trialcs;
655 int err; 705 int err, cpu_exclusive_changed;
656 706
657 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 707 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
658 708
@@ -663,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
663 clear_bit(bit, &trialcs.flags); 713 clear_bit(bit, &trialcs.flags);
664 714
665 err = validate_change(cs, &trialcs); 715 err = validate_change(cs, &trialcs);
666 if (err == 0) { 716 if (err < 0)
667 if (turning_on) 717 return err;
668 set_bit(bit, &cs->flags); 718 cpu_exclusive_changed =
669 else 719 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
670 clear_bit(bit, &cs->flags); 720 if (turning_on)
671 } 721 set_bit(bit, &cs->flags);
672 return err; 722 else
723 clear_bit(bit, &cs->flags);
724
725 if (cpu_exclusive_changed)
726 update_cpu_domains(cs);
727 return 0;
673} 728}
674 729
675static int attach_task(struct cpuset *cs, char *buf) 730static int attach_task(struct cpuset *cs, char *buf)
@@ -1315,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1315 up(&cpuset_sem); 1370 up(&cpuset_sem);
1316 return -EBUSY; 1371 return -EBUSY;
1317 } 1372 }
1318 spin_lock(&cs->dentry->d_lock);
1319 parent = cs->parent; 1373 parent = cs->parent;
1320 set_bit(CS_REMOVED, &cs->flags); 1374 set_bit(CS_REMOVED, &cs->flags);
1375 if (is_cpu_exclusive(cs))
1376 update_cpu_domains(cs);
1321 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1377 list_del(&cs->sibling); /* delete my sibling from parent->children */
1322 if (list_empty(&parent->children)) 1378 if (list_empty(&parent->children))
1323 check_for_release(parent); 1379 check_for_release(parent);
1380 spin_lock(&cs->dentry->d_lock);
1324 d = dget(cs->dentry); 1381 d = dget(cs->dentry);
1325 cs->dentry = NULL; 1382 cs->dentry = NULL;
1326 spin_unlock(&d->d_lock); 1383 spin_unlock(&d->d_lock);
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..459ba49e376a
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,52 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/smp_lock.h>
9#include <linux/errno.h>
10#include <linux/proc_fs.h>
11#include <linux/bootmem.h>
12#include <linux/highmem.h>
13#include <linux/crash_dump.h>
14
15#include <asm/io.h>
16#include <asm/uaccess.h>
17
18/* Stores the physical address of elf header of crash image. */
19unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
20
21/*
22 * Copy a page from "oldmem". For this page, there is no pte mapped
23 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
24 */
25ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
26 size_t csize, unsigned long offset, int userbuf)
27{
28 void *page, *vaddr;
29
30 if (!csize)
31 return 0;
32
33 page = kmalloc(PAGE_SIZE, GFP_KERNEL);
34 if (!page)
35 return -ENOMEM;
36
37 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
38 copy_page(page, vaddr);
39 kunmap_atomic(vaddr, KM_PTE0);
40
41 if (userbuf) {
42 if (copy_to_user(buf, (page + offset), csize)) {
43 kfree(page);
44 return -EFAULT;
45 }
46 } else {
47 memcpy(buf, (page + offset), csize);
48 }
49
50 kfree(page);
51 return csize;
52}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2ef2ad540201..9d1b10ed0135 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,11 @@ repeat:
72 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 72 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
73 __exit_signal(p); 73 __exit_signal(p);
74 __exit_sighand(p); 74 __exit_sighand(p);
75 /*
76 * Note that the fastpath in sys_times depends on __exit_signal having
77 * updated the counters before a task is removed from the tasklist of
78 * the process by __unhash_process.
79 */
75 __unhash_process(p); 80 __unhash_process(p);
76 81
77 /* 82 /*
@@ -779,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code)
779 784
780 profile_task_exit(tsk); 785 profile_task_exit(tsk);
781 786
787 WARN_ON(atomic_read(&tsk->fs_excl));
788
782 if (unlikely(in_interrupt())) 789 if (unlikely(in_interrupt()))
783 panic("Aiee, killing interrupt handler!"); 790 panic("Aiee, killing interrupt handler!");
784 if (unlikely(!tsk->pid)) 791 if (unlikely(!tsk->pid))
@@ -793,6 +800,17 @@ fastcall NORET_TYPE void do_exit(long code)
793 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); 800 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
794 } 801 }
795 802
803 /*
804 * We're taking recursive faults here in do_exit. Safest is to just
805 * leave this task alone and wait for reboot.
806 */
807 if (unlikely(tsk->flags & PF_EXITING)) {
808 printk(KERN_ALERT
809 "Fixing recursive fault but reboot is needed!\n");
810 set_current_state(TASK_UNINTERRUPTIBLE);
811 schedule();
812 }
813
796 tsk->flags |= PF_EXITING; 814 tsk->flags |= PF_EXITING;
797 815
798 /* 816 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index f42a17f88699..cdef6cea8900 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
194 mm->mmap = NULL; 194 mm->mmap = NULL;
195 mm->mmap_cache = NULL; 195 mm->mmap_cache = NULL;
196 mm->free_area_cache = oldmm->mmap_base; 196 mm->free_area_cache = oldmm->mmap_base;
197 mm->cached_hole_size = ~0UL;
197 mm->map_count = 0; 198 mm->map_count = 0;
198 set_mm_counter(mm, rss, 0); 199 set_mm_counter(mm, rss, 0);
199 set_mm_counter(mm, anon_rss, 0); 200 set_mm_counter(mm, anon_rss, 0);
@@ -249,8 +250,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
249 250
250 /* 251 /*
251 * Link in the new vma and copy the page table entries: 252 * Link in the new vma and copy the page table entries:
252 * link in first so that swapoff can see swap entries, 253 * link in first so that swapoff can see swap entries.
253 * and try_to_unmap_one's find_vma find the new vma. 254 * Note that, exceptionally, here the vma is inserted
255 * without holding mm->mmap_sem.
254 */ 256 */
255 spin_lock(&mm->page_table_lock); 257 spin_lock(&mm->page_table_lock);
256 *pprev = tmp; 258 *pprev = tmp;
@@ -322,6 +324,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
322 mm->ioctx_list = NULL; 324 mm->ioctx_list = NULL;
323 mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); 325 mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
324 mm->free_area_cache = TASK_UNMAPPED_BASE; 326 mm->free_area_cache = TASK_UNMAPPED_BASE;
327 mm->cached_hole_size = ~0UL;
325 328
326 if (likely(!mm_alloc_pgd(mm))) { 329 if (likely(!mm_alloc_pgd(mm))) {
327 mm->def_flags = 0; 330 mm->def_flags = 0;
@@ -1000,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags,
1000 p->pdeath_signal = 0; 1003 p->pdeath_signal = 0;
1001 p->exit_state = 0; 1004 p->exit_state = 0;
1002 1005
1003 /* Perform scheduler related setup */
1004 sched_fork(p);
1005
1006 /* 1006 /*
1007 * Ok, make it visible to the rest of the system. 1007 * Ok, make it visible to the rest of the system.
1008 * We dont wake it up yet. 1008 * We dont wake it up yet.
@@ -1011,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags,
1011 INIT_LIST_HEAD(&p->ptrace_children); 1011 INIT_LIST_HEAD(&p->ptrace_children);
1012 INIT_LIST_HEAD(&p->ptrace_list); 1012 INIT_LIST_HEAD(&p->ptrace_list);
1013 1013
1014 /* Perform scheduler related setup. Assign this task to a CPU. */
1015 sched_fork(p, clone_flags);
1016
1014 /* Need tasklist lock for parent etc handling! */ 1017 /* Need tasklist lock for parent etc handling! */
1015 write_lock_irq(&tasklist_lock); 1018 write_lock_irq(&tasklist_lock);
1016 1019
1017 /* 1020 /*
1018 * The task hasn't been attached yet, so cpus_allowed mask cannot 1021 * The task hasn't been attached yet, so its cpus_allowed mask will
1019 * have changed. The cpus_allowed mask of the parent may have 1022 * not be changed, nor will its assigned CPU.
1020 * changed after it was copied first time, and it may then move to 1023 *
1021 * another CPU - so we re-copy it here and set the child's CPU to 1024 * The cpus_allowed mask of the parent may have changed after it was
1022 * the parent's CPU. This avoids alot of nasty races. 1025 * copied first time - so re-copy it here, then check the child's CPU
1026 * to ensure it is on a valid CPU (and if not, just force it back to
1027 * parent's CPU). This avoids alot of nasty races.
1023 */ 1028 */
1024 p->cpus_allowed = current->cpus_allowed; 1029 p->cpus_allowed = current->cpus_allowed;
1025 set_task_cpu(p, smp_processor_id()); 1030 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
1031 set_task_cpu(p, smp_processor_id());
1026 1032
1027 /* 1033 /*
1028 * Check for pending SIGKILL! The new thread should not be allowed 1034 * Check for pending SIGKILL! The new thread should not be allowed
@@ -1084,6 +1090,11 @@ static task_t *copy_process(unsigned long clone_flags,
1084 spin_unlock(&current->sighand->siglock); 1090 spin_unlock(&current->sighand->siglock);
1085 } 1091 }
1086 1092
1093 /*
1094 * inherit ioprio
1095 */
1096 p->ioprio = current->ioprio;
1097
1087 SET_LINKS(p); 1098 SET_LINKS(p);
1088 if (unlikely(p->ptrace & PT_PTRACED)) 1099 if (unlikely(p->ptrace & PT_PTRACED))
1089 __ptrace_link(p, current->parent); 1100 __ptrace_link(p, current->parent);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 98d62d8efeaf..3467097ca61a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h>
12 13
13/* 14/*
14 * Autodetection depends on the fact that any interrupt that 15 * Autodetection depends on the fact that any interrupt that
@@ -26,7 +27,7 @@ static DECLARE_MUTEX(probe_sem);
26 */ 27 */
27unsigned long probe_irq_on(void) 28unsigned long probe_irq_on(void)
28{ 29{
29 unsigned long val, delay; 30 unsigned long val;
30 irq_desc_t *desc; 31 irq_desc_t *desc;
31 unsigned int i; 32 unsigned int i;
32 33
@@ -45,8 +46,7 @@ unsigned long probe_irq_on(void)
45 } 46 }
46 47
47 /* Wait for longstanding interrupts to trigger. */ 48 /* Wait for longstanding interrupts to trigger. */
48 for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) 49 msleep(20);
49 /* about 20ms delay */ barrier();
50 50
51 /* 51 /*
52 * enable any unassigned irqs 52 * enable any unassigned irqs
@@ -68,8 +68,7 @@ unsigned long probe_irq_on(void)
68 /* 68 /*
69 * Wait for spurious interrupts to trigger 69 * Wait for spurious interrupts to trigger
70 */ 70 */
71 for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) 71 msleep(100);
72 /* about 100ms delay */ barrier();
73 72
74 /* 73 /*
75 * Now filter out any obviously spurious interrupts 74 * Now filter out any obviously spurious interrupts
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 436c7d93c00a..c29f83c16497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -172,7 +172,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
172 172
173 spin_lock(&desc->lock); 173 spin_lock(&desc->lock);
174 if (!noirqdebug) 174 if (!noirqdebug)
175 note_interrupt(irq, desc, action_ret); 175 note_interrupt(irq, desc, action_ret, regs);
176 if (likely(!(desc->status & IRQ_PENDING))) 176 if (likely(!(desc->status & IRQ_PENDING)))
177 break; 177 break;
178 desc->status &= ~IRQ_PENDING; 178 desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5202e4c4a5b6..ac6700985705 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -6,6 +6,7 @@
6 * This file contains driver APIs to the irq subsystem. 6 * This file contains driver APIs to the irq subsystem.
7 */ 7 */
8 8
9#include <linux/config.h>
9#include <linux/irq.h> 10#include <linux/irq.h>
10#include <linux/module.h> 11#include <linux/module.h>
11#include <linux/random.h> 12#include <linux/random.h>
@@ -255,6 +256,13 @@ void free_irq(unsigned int irq, void *dev_id)
255 256
256 /* Found it - now remove it from the list of entries */ 257 /* Found it - now remove it from the list of entries */
257 *pp = action->next; 258 *pp = action->next;
259
260 /* Currently used only by UML, might disappear one day.*/
261#ifdef CONFIG_IRQ_RELEASE_METHOD
262 if (desc->handler->release)
263 desc->handler->release(irq, dev_id);
264#endif
265
258 if (!desc->action) { 266 if (!desc->action) {
259 desc->status |= IRQ_DISABLED; 267 desc->status |= IRQ_DISABLED;
260 if (desc->handler->shutdown) 268 if (desc->handler->shutdown)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f6297c306905..7df9abd5ec86 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,6 +11,83 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13 13
14static int irqfixup;
15
16/*
17 * Recovery handler for misrouted interrupts.
18 */
19
20static int misrouted_irq(int irq, struct pt_regs *regs)
21{
22 int i;
23 irq_desc_t *desc;
24 int ok = 0;
25 int work = 0; /* Did we do work for a real IRQ */
26
27 for(i = 1; i < NR_IRQS; i++) {
28 struct irqaction *action;
29
30 if (i == irq) /* Already tried */
31 continue;
32 desc = &irq_desc[i];
33 spin_lock(&desc->lock);
34 action = desc->action;
35 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) {
37 /*
38 * Already running: If it is shared get the other
39 * CPU to go looking for our mystery interrupt too
40 */
41 if (desc->action && (desc->action->flags & SA_SHIRQ))
42 desc->status |= IRQ_PENDING;
43 spin_unlock(&desc->lock);
44 continue;
45 }
46 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS;
48 spin_unlock(&desc->lock);
49 while (action) {
50 /* Only shared IRQ handlers are safe to call */
51 if (action->flags & SA_SHIRQ) {
52 if (action->handler(i, action->dev_id, regs) ==
53 IRQ_HANDLED)
54 ok = 1;
55 }
56 action = action->next;
57 }
58 local_irq_disable();
59 /* Now clean up the flags */
60 spin_lock(&desc->lock);
61 action = desc->action;
62
63 /*
64 * While we were looking for a fixup someone queued a real
65 * IRQ clashing with our walk
66 */
67
68 while ((desc->status & IRQ_PENDING) && action) {
69 /*
70 * Perform real IRQ processing for the IRQ we deferred
71 */
72 work = 1;
73 spin_unlock(&desc->lock);
74 handle_IRQ_event(i, regs, action);
75 spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if(work)
84 desc->handler->end(i);
85 spin_unlock(&desc->lock);
86 }
87 /* So the caller can adjust the irq error counts */
88 return ok;
89}
90
14/* 91/*
15 * If 99,900 of the previous 100,000 interrupts have not been handled 92 * If 99,900 of the previous 100,000 interrupts have not been handled
16 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 93 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -31,7 +108,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
31 printk(KERN_ERR "irq event %d: bogus return value %x\n", 108 printk(KERN_ERR "irq event %d: bogus return value %x\n",
32 irq, action_ret); 109 irq, action_ret);
33 } else { 110 } else {
34 printk(KERN_ERR "irq %d: nobody cared!\n", irq); 111 printk(KERN_ERR "irq %d: nobody cared (try booting with "
112 "the \"irqpoll\" option)\n", irq);
35 } 113 }
36 dump_stack(); 114 dump_stack();
37 printk(KERN_ERR "handlers:\n"); 115 printk(KERN_ERR "handlers:\n");
@@ -45,7 +123,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
45 } 123 }
46} 124}
47 125
48void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 126static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
49{ 127{
50 static int count = 100; 128 static int count = 100;
51 129
@@ -55,7 +133,8 @@ void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
55 } 133 }
56} 134}
57 135
58void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
137 struct pt_regs *regs)
59{ 138{
60 if (action_ret != IRQ_HANDLED) { 139 if (action_ret != IRQ_HANDLED) {
61 desc->irqs_unhandled++; 140 desc->irqs_unhandled++;
@@ -63,6 +142,15 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
63 report_bad_irq(irq, desc, action_ret); 142 report_bad_irq(irq, desc, action_ret);
64 } 143 }
65 144
145 if (unlikely(irqfixup)) {
146 /* Don't punish working computers */
147 if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
148 int ok = misrouted_irq(irq, regs);
149 if (action_ret == IRQ_NONE)
150 desc->irqs_unhandled -= ok;
151 }
152 }
153
66 desc->irq_count++; 154 desc->irq_count++;
67 if (desc->irq_count < 100000) 155 if (desc->irq_count < 100000)
68 return; 156 return;
@@ -94,3 +182,24 @@ int __init noirqdebug_setup(char *str)
94 182
95__setup("noirqdebug", noirqdebug_setup); 183__setup("noirqdebug", noirqdebug_setup);
96 184
185static int __init irqfixup_setup(char *str)
186{
187 irqfixup = 1;
188 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
189 printk(KERN_WARNING "This may impact system performance.\n");
190 return 1;
191}
192
193__setup("irqfixup", irqfixup_setup);
194
195static int __init irqpoll_setup(char *str)
196{
197 irqfixup = 2;
198 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
199 "enabled\n");
200 printk(KERN_WARNING "This may significantly impact system "
201 "performance\n");
202 return 1;
203}
204
205__setup("irqpoll", irqpoll_setup);
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 1dc988e0d2c7..a72cb0e5aa4b 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -153,11 +153,15 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
153 153
154 switch (which) { 154 switch (which) {
155 case ITIMER_REAL: 155 case ITIMER_REAL:
156again:
156 spin_lock_irq(&tsk->sighand->siglock); 157 spin_lock_irq(&tsk->sighand->siglock);
157 interval = tsk->signal->it_real_incr; 158 interval = tsk->signal->it_real_incr;
158 val = it_real_value(tsk->signal); 159 val = it_real_value(tsk->signal);
159 if (val) 160 /* We are sharing ->siglock with it_real_fn() */
160 del_timer_sync(&tsk->signal->real_timer); 161 if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) {
162 spin_unlock_irq(&tsk->sighand->siglock);
163 goto again;
164 }
161 tsk->signal->it_real_incr = 165 tsk->signal->it_real_incr =
162 timeval_to_jiffies(&value->it_interval); 166 timeval_to_jiffies(&value->it_interval);
163 it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); 167 it_real_arm(tsk, timeval_to_jiffies(&value->it_value));
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..cdd4dcd8fb63
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1063 @@
1/*
2 * kexec.c - kexec system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/file.h>
11#include <linux/slab.h>
12#include <linux/fs.h>
13#include <linux/kexec.h>
14#include <linux/spinlock.h>
15#include <linux/list.h>
16#include <linux/highmem.h>
17#include <linux/syscalls.h>
18#include <linux/reboot.h>
19#include <linux/syscalls.h>
20#include <linux/ioport.h>
21#include <linux/hardirq.h>
22
23#include <asm/page.h>
24#include <asm/uaccess.h>
25#include <asm/io.h>
26#include <asm/system.h>
27#include <asm/semaphore.h>
28
/* Location of the reserved area for the crash kernel.
 * start/end default to 0 here; presumably architecture setup code
 * fills them in when a crash-kernel region is reserved — the code
 * that does so is outside this file (TODO confirm against arch code).
 */
struct resource crashk_res = {
	.name  = "Crash kernel",
	.start = 0,
	.end   = 0,
	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
36
37int kexec_should_crash(struct task_struct *p)
38{
39 if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
40 return 1;
41 return 0;
42}
43
44/*
45 * When kexec transitions to the new kernel there is a one-to-one
46 * mapping between physical and virtual addresses. On processors
47 * where you can disable the MMU this is trivial, and easy. For
48 * others it is still a simple predictable page table to setup.
49 *
50 * In that environment kexec copies the new kernel to its final
51 * resting place. This means I can only support memory whose
52 * physical address can fit in an unsigned long. In particular
53 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
54 * If the assembly stub has more restrictive requirements
55 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
56 * defined more restrictively in <asm/kexec.h>.
57 *
58 * The code for the transition from the current kernel to the
59 * the new kernel is placed in the control_code_buffer, whose size
60 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
61 * page of memory is necessary, but some architectures require more.
62 * Because this memory must be identity mapped in the transition from
63 * virtual to physical addresses it must live in the range
64 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
65 * modifiable.
66 *
67 * The assembly stub in the control code buffer is passed a linked list
68 * of descriptor pages detailing the source pages of the new kernel,
69 * and the destination addresses of those source pages. As this data
70 * structure is not used in the context of the current OS, it must
71 * be self-contained.
72 *
73 * The code has been made to work with highmem pages and will use a
74 * destination page in its final resting place (if it happens
75 * to allocate it). The end product of this is that most of the
76 * physical address space, and most of RAM can be used.
77 *
78 * Future directions include:
79 * - allocating a page table with the control code buffer identity
80 * mapped, to simplify machine_kexec and make kexec_on_panic more
81 * reliable.
82 */
83
84/*
85 * KIMAGE_NO_DEST is an impossible destination address..., for
86 * allocating pages whose destination address we do not care about.
87 */
88#define KIMAGE_NO_DEST (-1UL)
89
90static int kimage_is_destination_range(struct kimage *image,
91 unsigned long start, unsigned long end);
92static struct page *kimage_alloc_page(struct kimage *image,
93 unsigned int gfp_mask,
94 unsigned long dest);
95
96static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
97 unsigned long nr_segments,
98 struct kexec_segment __user *segments)
99{
100 size_t segment_bytes;
101 struct kimage *image;
102 unsigned long i;
103 int result;
104
105 /* Allocate a controlling structure */
106 result = -ENOMEM;
107 image = kmalloc(sizeof(*image), GFP_KERNEL);
108 if (!image)
109 goto out;
110
111 memset(image, 0, sizeof(*image));
112 image->head = 0;
113 image->entry = &image->head;
114 image->last_entry = &image->head;
115 image->control_page = ~0; /* By default this does not apply */
116 image->start = entry;
117 image->type = KEXEC_TYPE_DEFAULT;
118
119 /* Initialize the list of control pages */
120 INIT_LIST_HEAD(&image->control_pages);
121
122 /* Initialize the list of destination pages */
123 INIT_LIST_HEAD(&image->dest_pages);
124
125 /* Initialize the list of unuseable pages */
126 INIT_LIST_HEAD(&image->unuseable_pages);
127
128 /* Read in the segments */
129 image->nr_segments = nr_segments;
130 segment_bytes = nr_segments * sizeof(*segments);
131 result = copy_from_user(image->segment, segments, segment_bytes);
132 if (result)
133 goto out;
134
135 /*
136 * Verify we have good destination addresses. The caller is
137 * responsible for making certain we don't attempt to load
138 * the new image into invalid or reserved areas of RAM. This
139 * just verifies it is an address we can use.
140 *
141 * Since the kernel does everything in page size chunks ensure
142 * the destination addreses are page aligned. Too many
143 * special cases crop of when we don't do this. The most
144 * insidious is getting overlapping destination addresses
145 * simply because addresses are changed to page size
146 * granularity.
147 */
148 result = -EADDRNOTAVAIL;
149 for (i = 0; i < nr_segments; i++) {
150 unsigned long mstart, mend;
151
152 mstart = image->segment[i].mem;
153 mend = mstart + image->segment[i].memsz;
154 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
155 goto out;
156 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
157 goto out;
158 }
159
160 /* Verify our destination addresses do not overlap.
161 * If we alloed overlapping destination addresses
162 * through very weird things can happen with no
163 * easy explanation as one segment stops on another.
164 */
165 result = -EINVAL;
166 for (i = 0; i < nr_segments; i++) {
167 unsigned long mstart, mend;
168 unsigned long j;
169
170 mstart = image->segment[i].mem;
171 mend = mstart + image->segment[i].memsz;
172 for (j = 0; j < i; j++) {
173 unsigned long pstart, pend;
174 pstart = image->segment[j].mem;
175 pend = pstart + image->segment[j].memsz;
176 /* Do the segments overlap ? */
177 if ((mend > pstart) && (mstart < pend))
178 goto out;
179 }
180 }
181
182 /* Ensure our buffer sizes are strictly less than
183 * our memory sizes. This should always be the case,
184 * and it is easier to check up front than to be surprised
185 * later on.
186 */
187 result = -EINVAL;
188 for (i = 0; i < nr_segments; i++) {
189 if (image->segment[i].bufsz > image->segment[i].memsz)
190 goto out;
191 }
192
193 result = 0;
194out:
195 if (result == 0)
196 *rimage = image;
197 else
198 kfree(image);
199
200 return result;
201
202}
203
204static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
205 unsigned long nr_segments,
206 struct kexec_segment __user *segments)
207{
208 int result;
209 struct kimage *image;
210
211 /* Allocate and initialize a controlling structure */
212 image = NULL;
213 result = do_kimage_alloc(&image, entry, nr_segments, segments);
214 if (result)
215 goto out;
216
217 *rimage = image;
218
219 /*
220 * Find a location for the control code buffer, and add it
221 * the vector of segments so that it's pages will also be
222 * counted as destination pages.
223 */
224 result = -ENOMEM;
225 image->control_code_page = kimage_alloc_control_pages(image,
226 get_order(KEXEC_CONTROL_CODE_SIZE));
227 if (!image->control_code_page) {
228 printk(KERN_ERR "Could not allocate control_code_buffer\n");
229 goto out;
230 }
231
232 result = 0;
233 out:
234 if (result == 0)
235 *rimage = image;
236 else
237 kfree(image);
238
239 return result;
240}
241
/*
 * Allocate a kimage for a crash (kdump) kernel.  All destination
 * addresses, the entry point, and the control pages must fall inside
 * the reserved crashk_res region because segment data is later copied
 * straight into that memory (see kimage_load_crash_segment()).
 *
 * On success *rimage is set and 0 is returned; on failure the image
 * is freed and a negative errno is returned.
 */
static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
			      unsigned long nr_segments,
			      struct kexec_segment __user *segments)
{
	int result;
	struct kimage *image;
	unsigned long i;

	image = NULL;
	/* Verify we have a valid entry point */
	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
		result = -EADDRNOTAVAIL;
		goto out;
	}

	/* Allocate and initialize a controlling structure */
	result = do_kimage_alloc(&image, entry, nr_segments, segments);
	if (result)
		goto out;

	/* Enable the special crash kernel control page
	 * allocation policy.
	 */
	image->control_page = crashk_res.start;
	image->type = KEXEC_TYPE_CRASH;

	/*
	 * Verify we have good destination addresses.  Normally
	 * the caller is responsible for making certain we don't
	 * attempt to load the new image into invalid or reserved
	 * areas of RAM.  But crash kernels are preloaded into a
	 * reserved area of ram.  We must ensure the addresses
	 * are in the reserved area otherwise preloading the
	 * kernel could corrupt things.
	 */
	result = -EADDRNOTAVAIL;
	for (i = 0; i < nr_segments; i++) {
		unsigned long mstart, mend;

		/* mend is inclusive here (memsz - 1), unlike the
		 * exclusive end used in do_kimage_alloc().
		 */
		mstart = image->segment[i].mem;
		mend = mstart + image->segment[i].memsz - 1;
		/* Ensure we are within the crash kernel limits */
		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
			goto out;
	}

	/*
	 * Find a location for the control code buffer, and add
	 * the vector of segments so that its pages will also be
	 * counted as destination pages.
	 */
	result = -ENOMEM;
	image->control_code_page = kimage_alloc_control_pages(image,
					get_order(KEXEC_CONTROL_CODE_SIZE));
	if (!image->control_code_page) {
		printk(KERN_ERR "Could not allocate control_code_buffer\n");
		goto out;
	}

	result = 0;
out:
	if (result == 0)
		*rimage = image;
	else
		kfree(image);

	return result;
}
310
311static int kimage_is_destination_range(struct kimage *image,
312 unsigned long start,
313 unsigned long end)
314{
315 unsigned long i;
316
317 for (i = 0; i < image->nr_segments; i++) {
318 unsigned long mstart, mend;
319
320 mstart = image->segment[i].mem;
321 mend = mstart + image->segment[i].memsz;
322 if ((end > mstart) && (start < mend))
323 return 1;
324 }
325
326 return 0;
327}
328
329static struct page *kimage_alloc_pages(unsigned int gfp_mask,
330 unsigned int order)
331{
332 struct page *pages;
333
334 pages = alloc_pages(gfp_mask, order);
335 if (pages) {
336 unsigned int count, i;
337 pages->mapping = NULL;
338 pages->private = order;
339 count = 1 << order;
340 for (i = 0; i < count; i++)
341 SetPageReserved(pages + i);
342 }
343
344 return pages;
345}
346
347static void kimage_free_pages(struct page *page)
348{
349 unsigned int order, count, i;
350
351 order = page->private;
352 count = 1 << order;
353 for (i = 0; i < count; i++)
354 ClearPageReserved(page + i);
355 __free_pages(page, order);
356}
357
358static void kimage_free_page_list(struct list_head *list)
359{
360 struct list_head *pos, *next;
361
362 list_for_each_safe(pos, next, list) {
363 struct page *page;
364
365 page = list_entry(pos, struct page, lru);
366 list_del(&page->lru);
367 kimage_free_pages(page);
368 }
369}
370
/*
 * Allocate 2^order contiguous control pages for a normal kexec image,
 * retrying until the allocation neither exceeds
 * KEXEC_CONTROL_MEMORY_LIMIT nor collides with a segment destination.
 * Returns the page block, or NULL when memory is exhausted.
 */
static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
						      unsigned int order)
{
	/* Control pages are special, they are the intermediaries
	 * that are needed while we copy the rest of the pages
	 * to their final resting place.  As such they must
	 * not conflict with either the destination addresses
	 * or memory the kernel is already using.
	 *
	 * The only case where we really need more than one of
	 * these are for architectures where we cannot disable
	 * the MMU and must instead generate an identity mapped
	 * page table for all of the memory.
	 *
	 * At worst this runs in O(N) of the image size.
	 */
	struct list_head extra_pages;
	struct page *pages;
	unsigned int count;

	count = 1 << order;
	INIT_LIST_HEAD(&extra_pages);

	/* Loop while I can allocate a page and the page allocated
	 * is a destination page.  Rejected allocations are parked on
	 * extra_pages so the allocator cannot hand them back to us.
	 */
	do {
		unsigned long pfn, epfn, addr, eaddr;

		pages = kimage_alloc_pages(GFP_KERNEL, order);
		if (!pages)
			break;
		pfn = page_to_pfn(pages);
		epfn = pfn + count;
		addr = pfn << PAGE_SHIFT;
		eaddr = epfn << PAGE_SHIFT;
		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
		    kimage_is_destination_range(image, addr, eaddr)) {
			list_add(&pages->lru, &extra_pages);
			pages = NULL;
		}
	} while (!pages);

	if (pages) {
		/* Remember the allocated page... */
		list_add(&pages->lru, &image->control_pages);

		/* Because the page is already in its destination
		 * location we will never allocate another page at
		 * that address.  Therefore kimage_alloc_pages
		 * will not return it (again) and we don't need
		 * to give it an entry in image->segment[].
		 */
	}
	/* Deal with the destination pages I have inadvertently allocated.
	 *
	 * Ideally I would convert multi-page allocations into single
	 * page allocations, and add everything to image->dest_pages.
	 *
	 * For now it is simpler to just free the pages.
	 */
	kimage_free_page_list(&extra_pages);

	return pages;
}
436
437static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
438 unsigned int order)
439{
440 /* Control pages are special, they are the intermediaries
441 * that are needed while we copy the rest of the pages
442 * to their final resting place. As such they must
443 * not conflict with either the destination addresses
444 * or memory the kernel is already using.
445 *
446 * Control pages are also the only pags we must allocate
447 * when loading a crash kernel. All of the other pages
448 * are specified by the segments and we just memcpy
449 * into them directly.
450 *
451 * The only case where we really need more than one of
452 * these are for architectures where we cannot disable
453 * the MMU and must instead generate an identity mapped
454 * page table for all of the memory.
455 *
456 * Given the low demand this implements a very simple
457 * allocator that finds the first hole of the appropriate
458 * size in the reserved memory region, and allocates all
459 * of the memory up to and including the hole.
460 */
461 unsigned long hole_start, hole_end, size;
462 struct page *pages;
463
464 pages = NULL;
465 size = (1 << order) << PAGE_SHIFT;
466 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
467 hole_end = hole_start + size - 1;
468 while (hole_end <= crashk_res.end) {
469 unsigned long i;
470
471 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
472 break;
473 if (hole_end > crashk_res.end)
474 break;
475 /* See if I overlap any of the segments */
476 for (i = 0; i < image->nr_segments; i++) {
477 unsigned long mstart, mend;
478
479 mstart = image->segment[i].mem;
480 mend = mstart + image->segment[i].memsz - 1;
481 if ((hole_end >= mstart) && (hole_start <= mend)) {
482 /* Advance the hole to the end of the segment */
483 hole_start = (mend + (size - 1)) & ~(size - 1);
484 hole_end = hole_start + size - 1;
485 break;
486 }
487 }
488 /* If I don't overlap any segments I have found my hole! */
489 if (i == image->nr_segments) {
490 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
491 break;
492 }
493 }
494 if (pages)
495 image->control_page = hole_end;
496
497 return pages;
498}
499
500
501struct page *kimage_alloc_control_pages(struct kimage *image,
502 unsigned int order)
503{
504 struct page *pages = NULL;
505
506 switch (image->type) {
507 case KEXEC_TYPE_DEFAULT:
508 pages = kimage_alloc_normal_control_pages(image, order);
509 break;
510 case KEXEC_TYPE_CRASH:
511 pages = kimage_alloc_crash_control_pages(image, order);
512 break;
513 }
514
515 return pages;
516}
517
/*
 * Append @entry to the image's entry list, growing the list by a new
 * indirection page when the current one is full.  image->entry always
 * points at the current slot and a 0 terminator is kept after the last
 * written entry.  Returns 0 or -ENOMEM.
 */
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
	/* Move off a previously written slot onto the terminator slot. */
	if (*image->entry != 0)
		image->entry++;

	if (image->entry == image->last_entry) {
		kimage_entry_t *ind_page;
		struct page *page;

		/* Current page is full: chain in a fresh indirection
		 * page and continue writing there.
		 */
		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
		if (!page)
			return -ENOMEM;

		ind_page = page_address(page);
		/* Overwrite the last slot with a link to the new page. */
		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
		image->entry = ind_page;
		/* Reserve the new page's last slot for the next link. */
		image->last_entry = ind_page +
				((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
	}
	*image->entry = entry;
	image->entry++;
	*image->entry = 0;	/* keep the list terminated */

	return 0;
}
543
544static int kimage_set_destination(struct kimage *image,
545 unsigned long destination)
546{
547 int result;
548
549 destination &= PAGE_MASK;
550 result = kimage_add_entry(image, destination | IND_DESTINATION);
551 if (result == 0)
552 image->destination = destination;
553
554 return result;
555}
556
557
558static int kimage_add_page(struct kimage *image, unsigned long page)
559{
560 int result;
561
562 page &= PAGE_MASK;
563 result = kimage_add_entry(image, page | IND_SOURCE);
564 if (result == 0)
565 image->destination += PAGE_SIZE;
566
567 return result;
568}
569
570
571static void kimage_free_extra_pages(struct kimage *image)
572{
573 /* Walk through and free any extra destination pages I may have */
574 kimage_free_page_list(&image->dest_pages);
575
576 /* Walk through and free any unuseable pages I have cached */
577 kimage_free_page_list(&image->unuseable_pages);
578
579}
580static int kimage_terminate(struct kimage *image)
581{
582 if (*image->entry != 0)
583 image->entry++;
584
585 *image->entry = IND_DONE;
586
587 return 0;
588}
589
590#define for_each_kimage_entry(image, ptr, entry) \
591 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
592 ptr = (entry & IND_INDIRECTION)? \
593 phys_to_virt((entry & PAGE_MASK)): ptr +1)
594
595static void kimage_free_entry(kimage_entry_t entry)
596{
597 struct page *page;
598
599 page = pfn_to_page(entry >> PAGE_SHIFT);
600 kimage_free_pages(page);
601}
602
/*
 * Tear down @image: free every source page, every indirection page,
 * the cached extra pages, the control pages, and finally the kimage
 * structure itself.  Safe to call with a NULL image.
 */
static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page.  The
			 * current one cannot be freed yet: the walker
			 * above is still reading entries out of it.
			 */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		}
		else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}
636
637static kimage_entry_t *kimage_dst_used(struct kimage *image,
638 unsigned long page)
639{
640 kimage_entry_t *ptr, entry;
641 unsigned long destination = 0;
642
643 for_each_kimage_entry(image, ptr, entry) {
644 if (entry & IND_DESTINATION)
645 destination = entry & PAGE_MASK;
646 else if (entry & IND_SOURCE) {
647 if (page == destination)
648 return ptr;
649 destination += PAGE_SIZE;
650 }
651 }
652
653 return NULL;
654}
655
/*
 * Allocate a source page to hold the data bound for @destination
 * (a page-aligned physical address, or KIMAGE_NO_DEST when the caller
 * does not care).  Returns NULL on allocation failure.
 */
static struct page *kimage_alloc_page(struct kimage *image,
				      unsigned int gfp_mask,
				      unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but the proof
	 * that no problems will not occur is trivial, and the
	 * implementation is simply to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.  If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someones destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it: copy the existing source data
			 * into the new page (which IS its destination)
			 * and retarget the old entry at it.
			 */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		}
		else {
			/* Place the page on the destination list I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}
750
751static int kimage_load_normal_segment(struct kimage *image,
752 struct kexec_segment *segment)
753{
754 unsigned long maddr;
755 unsigned long ubytes, mbytes;
756 int result;
757 unsigned char __user *buf;
758
759 result = 0;
760 buf = segment->buf;
761 ubytes = segment->bufsz;
762 mbytes = segment->memsz;
763 maddr = segment->mem;
764
765 result = kimage_set_destination(image, maddr);
766 if (result < 0)
767 goto out;
768
769 while (mbytes) {
770 struct page *page;
771 char *ptr;
772 size_t uchunk, mchunk;
773
774 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
775 if (page == 0) {
776 result = -ENOMEM;
777 goto out;
778 }
779 result = kimage_add_page(image, page_to_pfn(page)
780 << PAGE_SHIFT);
781 if (result < 0)
782 goto out;
783
784 ptr = kmap(page);
785 /* Start with a clear page */
786 memset(ptr, 0, PAGE_SIZE);
787 ptr += maddr & ~PAGE_MASK;
788 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
789 if (mchunk > mbytes)
790 mchunk = mbytes;
791
792 uchunk = mchunk;
793 if (uchunk > ubytes)
794 uchunk = ubytes;
795
796 result = copy_from_user(ptr, buf, uchunk);
797 kunmap(page);
798 if (result) {
799 result = (result < 0) ? result : -EIO;
800 goto out;
801 }
802 ubytes -= uchunk;
803 maddr += mchunk;
804 buf += mchunk;
805 mbytes -= mchunk;
806 }
807out:
808 return result;
809}
810
811static int kimage_load_crash_segment(struct kimage *image,
812 struct kexec_segment *segment)
813{
814 /* For crash dumps kernels we simply copy the data from
815 * user space to it's destination.
816 * We do things a page at a time for the sake of kmap.
817 */
818 unsigned long maddr;
819 unsigned long ubytes, mbytes;
820 int result;
821 unsigned char __user *buf;
822
823 result = 0;
824 buf = segment->buf;
825 ubytes = segment->bufsz;
826 mbytes = segment->memsz;
827 maddr = segment->mem;
828 while (mbytes) {
829 struct page *page;
830 char *ptr;
831 size_t uchunk, mchunk;
832
833 page = pfn_to_page(maddr >> PAGE_SHIFT);
834 if (page == 0) {
835 result = -ENOMEM;
836 goto out;
837 }
838 ptr = kmap(page);
839 ptr += maddr & ~PAGE_MASK;
840 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
841 if (mchunk > mbytes)
842 mchunk = mbytes;
843
844 uchunk = mchunk;
845 if (uchunk > ubytes) {
846 uchunk = ubytes;
847 /* Zero the trailing part of the page */
848 memset(ptr + uchunk, 0, mchunk - uchunk);
849 }
850 result = copy_from_user(ptr, buf, uchunk);
851 kunmap(page);
852 if (result) {
853 result = (result < 0) ? result : -EIO;
854 goto out;
855 }
856 ubytes -= uchunk;
857 maddr += mchunk;
858 buf += mchunk;
859 mbytes -= mchunk;
860 }
861out:
862 return result;
863}
864
865static int kimage_load_segment(struct kimage *image,
866 struct kexec_segment *segment)
867{
868 int result = -ENOMEM;
869
870 switch (image->type) {
871 case KEXEC_TYPE_DEFAULT:
872 result = kimage_load_normal_segment(image, segment);
873 break;
874 case KEXEC_TYPE_CRASH:
875 result = kimage_load_crash_segment(image, segment);
876 break;
877 }
878
879 return result;
880}
881
882/*
883 * Exec Kernel system call: for obvious reasons only root may call it.
884 *
885 * This call breaks up into three pieces.
886 * - A generic part which loads the new kernel from the current
887 * address space, and very carefully places the data in the
888 * allocated pages.
889 *
890 * - A generic part that interacts with the kernel and tells all of
891 * the devices to shut down. Preventing on-going dmas, and placing
892 * the devices in a consistent state so a later kernel can
893 * reinitialize them.
894 *
895 * - A machine specific part that includes the syscall number
896 * and the copies the image to it's final destination. And
897 * jumps into the image at entry.
898 *
899 * kexec does not sync, or unmount filesystems so if you need
900 * that to happen you need to do that yourself.
901 */
/* The currently loaded kexec image (non-static; presumably referenced
 * by arch reboot code — confirm against machine_kexec callers). */
struct kimage *kexec_image = NULL;
/* The image to jump into from crash_kexec() after a panic/oops. */
static struct kimage *kexec_crash_image = NULL;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 * Acquired/released with xchg(); 1 means held.
 */
static int kexec_lock = 0;
910
911asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
912 struct kexec_segment __user *segments,
913 unsigned long flags)
914{
915 struct kimage **dest_image, *image;
916 int locked;
917 int result;
918
919 /* We only trust the superuser with rebooting the system. */
920 if (!capable(CAP_SYS_BOOT))
921 return -EPERM;
922
923 /*
924 * Verify we have a legal set of flags
925 * This leaves us room for future extensions.
926 */
927 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
928 return -EINVAL;
929
930 /* Verify we are on the appropriate architecture */
931 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
932 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
933 return -EINVAL;
934
935 /* Put an artificial cap on the number
936 * of segments passed to kexec_load.
937 */
938 if (nr_segments > KEXEC_SEGMENT_MAX)
939 return -EINVAL;
940
941 image = NULL;
942 result = 0;
943
944 /* Because we write directly to the reserved memory
945 * region when loading crash kernels we need a mutex here to
946 * prevent multiple crash kernels from attempting to load
947 * simultaneously, and to prevent a crash kernel from loading
948 * over the top of a in use crash kernel.
949 *
950 * KISS: always take the mutex.
951 */
952 locked = xchg(&kexec_lock, 1);
953 if (locked)
954 return -EBUSY;
955
956 dest_image = &kexec_image;
957 if (flags & KEXEC_ON_CRASH)
958 dest_image = &kexec_crash_image;
959 if (nr_segments > 0) {
960 unsigned long i;
961
962 /* Loading another kernel to reboot into */
963 if ((flags & KEXEC_ON_CRASH) == 0)
964 result = kimage_normal_alloc(&image, entry,
965 nr_segments, segments);
966 /* Loading another kernel to switch to if this one crashes */
967 else if (flags & KEXEC_ON_CRASH) {
968 /* Free any current crash dump kernel before
969 * we corrupt it.
970 */
971 kimage_free(xchg(&kexec_crash_image, NULL));
972 result = kimage_crash_alloc(&image, entry,
973 nr_segments, segments);
974 }
975 if (result)
976 goto out;
977
978 result = machine_kexec_prepare(image);
979 if (result)
980 goto out;
981
982 for (i = 0; i < nr_segments; i++) {
983 result = kimage_load_segment(image, &image->segment[i]);
984 if (result)
985 goto out;
986 }
987 result = kimage_terminate(image);
988 if (result)
989 goto out;
990 }
991 /* Install the new kernel, and Uninstall the old */
992 image = xchg(dest_image, image);
993
994out:
995 xchg(&kexec_lock, 0); /* Release the mutex */
996 kimage_free(image);
997
998 return result;
999}
1000
1001#ifdef CONFIG_COMPAT
/*
 * 32-bit compat entry point for kexec_load: translate each compat
 * segment descriptor into a native struct kexec_segment placed in
 * compat user space, then chain to sys_kexec_load().
 */
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				      unsigned long nr_segments,
				      struct compat_kexec_segment __user *segments,
				      unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i=0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		/* Widen the compat pointer; the sizes copy across. */
		out.buf = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
1038#endif
1039
1040void crash_kexec(struct pt_regs *regs)
1041{
1042 struct kimage *image;
1043 int locked;
1044
1045
1046 /* Take the kexec_lock here to prevent sys_kexec_load
1047 * running on one cpu from replacing the crash kernel
1048 * we are using after a panic on a different cpu.
1049 *
1050 * If the crash kernel was not located in a fixed area
1051 * of memory the xchg(&kexec_crash_image) would be
1052 * sufficient. But since I reuse the memory...
1053 */
1054 locked = xchg(&kexec_lock, 1);
1055 if (!locked) {
1056 image = xchg(&kexec_crash_image, NULL);
1057 if (image) {
1058 machine_crash_shutdown(regs);
1059 machine_kexec(image);
1060 }
1061 xchg(&kexec_lock, 0);
1062 }
1063}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index eed53d4f5230..44166e3bb8af 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,6 +120,7 @@ struct subprocess_info {
120 char *path; 120 char *path;
121 char **argv; 121 char **argv;
122 char **envp; 122 char **envp;
123 struct key *ring;
123 int wait; 124 int wait;
124 int retval; 125 int retval;
125}; 126};
@@ -130,16 +131,21 @@ struct subprocess_info {
130static int ____call_usermodehelper(void *data) 131static int ____call_usermodehelper(void *data)
131{ 132{
132 struct subprocess_info *sub_info = data; 133 struct subprocess_info *sub_info = data;
134 struct key *old_session;
133 int retval; 135 int retval;
134 136
135 /* Unblock all signals. */ 137 /* Unblock all signals and set the session keyring. */
138 key_get(sub_info->ring);
136 flush_signals(current); 139 flush_signals(current);
137 spin_lock_irq(&current->sighand->siglock); 140 spin_lock_irq(&current->sighand->siglock);
141 old_session = __install_session_keyring(current, sub_info->ring);
138 flush_signal_handlers(current, 1); 142 flush_signal_handlers(current, 1);
139 sigemptyset(&current->blocked); 143 sigemptyset(&current->blocked);
140 recalc_sigpending(); 144 recalc_sigpending();
141 spin_unlock_irq(&current->sighand->siglock); 145 spin_unlock_irq(&current->sighand->siglock);
142 146
147 key_put(old_session);
148
143 /* We can run anywhere, unlike our parent keventd(). */ 149 /* We can run anywhere, unlike our parent keventd(). */
144 set_cpus_allowed(current, CPU_MASK_ALL); 150 set_cpus_allowed(current, CPU_MASK_ALL);
145 151
@@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data)
211} 217}
212 218
213/** 219/**
214 * call_usermodehelper - start a usermode application 220 * call_usermodehelper_keys - start a usermode application
215 * @path: pathname for the application 221 * @path: pathname for the application
216 * @argv: null-terminated argument list 222 * @argv: null-terminated argument list
217 * @envp: null-terminated environment list 223 * @envp: null-terminated environment list
224 * @session_keyring: session keyring for process (NULL for an empty keyring)
218 * @wait: wait for the application to finish and return status. 225 * @wait: wait for the application to finish and return status.
219 * 226 *
220 * Runs a user-space application. The application is started 227 * Runs a user-space application. The application is started
@@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data)
224 * Must be called from process context. Returns a negative error code 231 * Must be called from process context. Returns a negative error code
225 * if program was not execed successfully, or 0. 232 * if program was not execed successfully, or 0.
226 */ 233 */
227int call_usermodehelper(char *path, char **argv, char **envp, int wait) 234int call_usermodehelper_keys(char *path, char **argv, char **envp,
235 struct key *session_keyring, int wait)
228{ 236{
229 DECLARE_COMPLETION(done); 237 DECLARE_COMPLETION(done);
230 struct subprocess_info sub_info = { 238 struct subprocess_info sub_info = {
@@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
232 .path = path, 240 .path = path,
233 .argv = argv, 241 .argv = argv,
234 .envp = envp, 242 .envp = envp,
243 .ring = session_keyring,
235 .wait = wait, 244 .wait = wait,
236 .retval = 0, 245 .retval = 0,
237 }; 246 };
@@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
247 wait_for_completion(&done); 256 wait_for_completion(&done);
248 return sub_info.retval; 257 return sub_info.retval;
249} 258}
250EXPORT_SYMBOL(call_usermodehelper); 259EXPORT_SYMBOL(call_usermodehelper_keys);
251 260
252void __init usermodehelper_init(void) 261void __init usermodehelper_init(void)
253{ 262{
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 037142b72a49..90c0e82b650c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -27,12 +27,16 @@
27 * interface to access function arguments. 27 * interface to access function arguments.
28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes 28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
29 * exceptions notifier to be first on the priority list. 29 * exceptions notifier to be first on the priority list.
30 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
31 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
32 * <prasanna@in.ibm.com> added function-return probes.
30 */ 33 */
31#include <linux/kprobes.h> 34#include <linux/kprobes.h>
32#include <linux/spinlock.h> 35#include <linux/spinlock.h>
33#include <linux/hash.h> 36#include <linux/hash.h>
34#include <linux/init.h> 37#include <linux/init.h>
35#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h>
36#include <asm/cacheflush.h> 40#include <asm/cacheflush.h>
37#include <asm/errno.h> 41#include <asm/errno.h>
38#include <asm/kdebug.h> 42#include <asm/kdebug.h>
@@ -41,11 +45,112 @@
41#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 45#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
42 46
43static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 47static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
48static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
44 49
45unsigned int kprobe_cpu = NR_CPUS; 50unsigned int kprobe_cpu = NR_CPUS;
46static DEFINE_SPINLOCK(kprobe_lock); 51static DEFINE_SPINLOCK(kprobe_lock);
47static struct kprobe *curr_kprobe; 52static struct kprobe *curr_kprobe;
48 53
54/*
55 * kprobe->ainsn.insn points to the copy of the instruction to be
56 * single-stepped. x86_64, POWER4 and above have no-exec support and
57 * stepping on the instruction on a vmalloced/kmalloced/data page
58 * is a recipe for disaster
59 */
60#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
61
62struct kprobe_insn_page {
63 struct hlist_node hlist;
64 kprobe_opcode_t *insns; /* Page of instruction slots */
65 char slot_used[INSNS_PER_PAGE];
66 int nused;
67};
68
69static struct hlist_head kprobe_insn_pages;
70
71/**
72 * get_insn_slot() - Find a slot on an executable page for an instruction.
73 * We allocate an executable page if there's no room on existing ones.
74 */
75kprobe_opcode_t *get_insn_slot(void)
76{
77 struct kprobe_insn_page *kip;
78 struct hlist_node *pos;
79
80 hlist_for_each(pos, &kprobe_insn_pages) {
81 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
82 if (kip->nused < INSNS_PER_PAGE) {
83 int i;
84 for (i = 0; i < INSNS_PER_PAGE; i++) {
85 if (!kip->slot_used[i]) {
86 kip->slot_used[i] = 1;
87 kip->nused++;
88 return kip->insns + (i * MAX_INSN_SIZE);
89 }
90 }
91 /* Surprise! No unused slots. Fix kip->nused. */
92 kip->nused = INSNS_PER_PAGE;
93 }
94 }
95
96 /* All out of space. Need to allocate a new page. Use slot 0.*/
97 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
98 if (!kip) {
99 return NULL;
100 }
101
102 /*
103 * Use module_alloc so this page is within +/- 2GB of where the
104 * kernel image and loaded module images reside. This is required
105 * so x86_64 can correctly handle the %rip-relative fixups.
106 */
107 kip->insns = module_alloc(PAGE_SIZE);
108 if (!kip->insns) {
109 kfree(kip);
110 return NULL;
111 }
112 INIT_HLIST_NODE(&kip->hlist);
113 hlist_add_head(&kip->hlist, &kprobe_insn_pages);
114 memset(kip->slot_used, 0, INSNS_PER_PAGE);
115 kip->slot_used[0] = 1;
116 kip->nused = 1;
117 return kip->insns;
118}
119
120void free_insn_slot(kprobe_opcode_t *slot)
121{
122 struct kprobe_insn_page *kip;
123 struct hlist_node *pos;
124
125 hlist_for_each(pos, &kprobe_insn_pages) {
126 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
127 if (kip->insns <= slot &&
128 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
129 int i = (slot - kip->insns) / MAX_INSN_SIZE;
130 kip->slot_used[i] = 0;
131 kip->nused--;
132 if (kip->nused == 0) {
133 /*
134 * Page is no longer in use. Free it unless
135 * it's the last one. We keep the last one
136 * so as not to have to set it up again the
137 * next time somebody inserts a probe.
138 */
139 hlist_del(&kip->hlist);
140 if (hlist_empty(&kprobe_insn_pages)) {
141 INIT_HLIST_NODE(&kip->hlist);
142 hlist_add_head(&kip->hlist,
143 &kprobe_insn_pages);
144 } else {
145 module_free(NULL, kip->insns);
146 kfree(kip);
147 }
148 }
149 return;
150 }
151 }
152}
153
49/* Locks kprobe: irqs must be disabled */ 154/* Locks kprobe: irqs must be disabled */
50void lock_kprobes(void) 155void lock_kprobes(void)
51{ 156{
@@ -78,22 +183,23 @@ struct kprobe *get_kprobe(void *addr)
78 * Aggregate handlers for multiple kprobes support - these handlers 183 * Aggregate handlers for multiple kprobes support - these handlers
79 * take care of invoking the individual kprobe handlers on p->list 184 * take care of invoking the individual kprobe handlers on p->list
80 */ 185 */
81int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 186static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
82{ 187{
83 struct kprobe *kp; 188 struct kprobe *kp;
84 189
85 list_for_each_entry(kp, &p->list, list) { 190 list_for_each_entry(kp, &p->list, list) {
86 if (kp->pre_handler) { 191 if (kp->pre_handler) {
87 curr_kprobe = kp; 192 curr_kprobe = kp;
88 kp->pre_handler(kp, regs); 193 if (kp->pre_handler(kp, regs))
89 curr_kprobe = NULL; 194 return 1;
90 } 195 }
196 curr_kprobe = NULL;
91 } 197 }
92 return 0; 198 return 0;
93} 199}
94 200
95void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 201static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
96 unsigned long flags) 202 unsigned long flags)
97{ 203{
98 struct kprobe *kp; 204 struct kprobe *kp;
99 205
@@ -107,7 +213,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
107 return; 213 return;
108} 214}
109 215
110int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) 216static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
217 int trapnr)
111{ 218{
112 /* 219 /*
113 * if we faulted "during" the execution of a user specified 220 * if we faulted "during" the execution of a user specified
@@ -120,19 +227,159 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
120 return 0; 227 return 0;
121} 228}
122 229
230static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
231{
232 struct kprobe *kp = curr_kprobe;
233 if (curr_kprobe && kp->break_handler) {
234 if (kp->break_handler(kp, regs)) {
235 curr_kprobe = NULL;
236 return 1;
237 }
238 }
239 curr_kprobe = NULL;
240 return 0;
241}
242
243struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
244{
245 struct hlist_node *node;
246 struct kretprobe_instance *ri;
247 hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
248 return ri;
249 return NULL;
250}
251
252static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
253{
254 struct hlist_node *node;
255 struct kretprobe_instance *ri;
256 hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
257 return ri;
258 return NULL;
259}
260
261void add_rp_inst(struct kretprobe_instance *ri)
262{
263 /*
264 * Remove rp inst off the free list -
265 * Add it back when probed function returns
266 */
267 hlist_del(&ri->uflist);
268
269 /* Add rp inst onto table */
270 INIT_HLIST_NODE(&ri->hlist);
271 hlist_add_head(&ri->hlist,
272 &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
273
274 /* Also add this rp inst to the used list. */
275 INIT_HLIST_NODE(&ri->uflist);
276 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
277}
278
279void recycle_rp_inst(struct kretprobe_instance *ri)
280{
281 /* remove rp inst off the rprobe_inst_table */
282 hlist_del(&ri->hlist);
283 if (ri->rp) {
284 /* remove rp inst off the used list */
285 hlist_del(&ri->uflist);
286 /* put rp inst back onto the free list */
287 INIT_HLIST_NODE(&ri->uflist);
288 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
289 } else
290 /* Unregistering */
291 kfree(ri);
292}
293
294struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
295{
296 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
297}
298
299/*
300 * This function is called from exit_thread or flush_thread when task tk's
301 * stack is being recycled so that we can recycle any function-return probe
302 * instances associated with this task. These left over instances represent
303 * probed functions that have been called but will never return.
304 */
305void kprobe_flush_task(struct task_struct *tk)
306{
307 struct kretprobe_instance *ri;
308 struct hlist_head *head;
309 struct hlist_node *node, *tmp;
310 unsigned long flags = 0;
311
312 spin_lock_irqsave(&kprobe_lock, flags);
313 head = kretprobe_inst_table_head(current);
314 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
315 if (ri->task == tk)
316 recycle_rp_inst(ri);
317 }
318 spin_unlock_irqrestore(&kprobe_lock, flags);
319}
320
321/*
322 * This kprobe pre_handler is registered with every kretprobe. When probe
323 * hits it will set up the return probe.
324 */
325static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
326{
327 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
328
329 /*TODO: consider to only swap the RA after the last pre_handler fired */
330 arch_prepare_kretprobe(rp, regs);
331 return 0;
332}
333
334static inline void free_rp_inst(struct kretprobe *rp)
335{
336 struct kretprobe_instance *ri;
337 while ((ri = get_free_rp_inst(rp)) != NULL) {
338 hlist_del(&ri->uflist);
339 kfree(ri);
340 }
341}
342
343/*
344 * Keep all fields in the kprobe consistent
345 */
346static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
347{
348 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
349 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
350}
351
352/*
353* Add the new probe to old_p->list. Fail if this is the
354* second jprobe at the address - two jprobes can't coexist
355*/
356static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
357{
358 struct kprobe *kp;
359
360 if (p->break_handler) {
361 list_for_each_entry(kp, &old_p->list, list) {
362 if (kp->break_handler)
363 return -EEXIST;
364 }
365 list_add_tail(&p->list, &old_p->list);
366 } else
367 list_add(&p->list, &old_p->list);
368 return 0;
369}
370
123/* 371/*
124 * Fill in the required fields of the "manager kprobe". Replace the 372 * Fill in the required fields of the "manager kprobe". Replace the
125 * earlier kprobe in the hlist with the manager kprobe 373 * earlier kprobe in the hlist with the manager kprobe
126 */ 374 */
127static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 375static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
128{ 376{
377 copy_kprobe(p, ap);
129 ap->addr = p->addr; 378 ap->addr = p->addr;
130 ap->opcode = p->opcode;
131 memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
132
133 ap->pre_handler = aggr_pre_handler; 379 ap->pre_handler = aggr_pre_handler;
134 ap->post_handler = aggr_post_handler; 380 ap->post_handler = aggr_post_handler;
135 ap->fault_handler = aggr_fault_handler; 381 ap->fault_handler = aggr_fault_handler;
382 ap->break_handler = aggr_break_handler;
136 383
137 INIT_LIST_HEAD(&ap->list); 384 INIT_LIST_HEAD(&ap->list);
138 list_add(&p->list, &ap->list); 385 list_add(&p->list, &ap->list);
@@ -153,16 +400,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
153 int ret = 0; 400 int ret = 0;
154 struct kprobe *ap; 401 struct kprobe *ap;
155 402
156 if (old_p->break_handler || p->break_handler) { 403 if (old_p->pre_handler == aggr_pre_handler) {
157 ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ 404 copy_kprobe(old_p, p);
158 } else if (old_p->pre_handler == aggr_pre_handler) { 405 ret = add_new_kprobe(old_p, p);
159 list_add(&p->list, &old_p->list);
160 } else { 406 } else {
161 ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); 407 ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC);
162 if (!ap) 408 if (!ap)
163 return -ENOMEM; 409 return -ENOMEM;
164 add_aggr_kprobe(ap, old_p); 410 add_aggr_kprobe(ap, old_p);
165 list_add(&p->list, &ap->list); 411 copy_kprobe(ap, p);
412 ret = add_new_kprobe(ap, p);
166 } 413 }
167 return ret; 414 return ret;
168} 415}
@@ -170,10 +417,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
170/* kprobe removal house-keeping routines */ 417/* kprobe removal house-keeping routines */
171static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) 418static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
172{ 419{
173 *p->addr = p->opcode; 420 arch_disarm_kprobe(p);
174 hlist_del(&p->hlist); 421 hlist_del(&p->hlist);
175 flush_icache_range((unsigned long) p->addr,
176 (unsigned long) p->addr + sizeof(kprobe_opcode_t));
177 spin_unlock_irqrestore(&kprobe_lock, flags); 422 spin_unlock_irqrestore(&kprobe_lock, flags);
178 arch_remove_kprobe(p); 423 arch_remove_kprobe(p);
179} 424}
@@ -200,6 +445,7 @@ int register_kprobe(struct kprobe *p)
200 } 445 }
201 spin_lock_irqsave(&kprobe_lock, flags); 446 spin_lock_irqsave(&kprobe_lock, flags);
202 old_p = get_kprobe(p->addr); 447 old_p = get_kprobe(p->addr);
448 p->nmissed = 0;
203 if (old_p) { 449 if (old_p) {
204 ret = register_aggr_kprobe(old_p, p); 450 ret = register_aggr_kprobe(old_p, p);
205 goto out; 451 goto out;
@@ -210,10 +456,8 @@ int register_kprobe(struct kprobe *p)
210 hlist_add_head(&p->hlist, 456 hlist_add_head(&p->hlist,
211 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 457 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
212 458
213 p->opcode = *p->addr; 459 arch_arm_kprobe(p);
214 *p->addr = BREAKPOINT_INSTRUCTION; 460
215 flush_icache_range((unsigned long) p->addr,
216 (unsigned long) p->addr + sizeof(kprobe_opcode_t));
217out: 461out:
218 spin_unlock_irqrestore(&kprobe_lock, flags); 462 spin_unlock_irqrestore(&kprobe_lock, flags);
219rm_kprobe: 463rm_kprobe:
@@ -257,16 +501,83 @@ void unregister_jprobe(struct jprobe *jp)
257 unregister_kprobe(&jp->kp); 501 unregister_kprobe(&jp->kp);
258} 502}
259 503
504#ifdef ARCH_SUPPORTS_KRETPROBES
505
506int register_kretprobe(struct kretprobe *rp)
507{
508 int ret = 0;
509 struct kretprobe_instance *inst;
510 int i;
511
512 rp->kp.pre_handler = pre_handler_kretprobe;
513
514 /* Pre-allocate memory for max kretprobe instances */
515 if (rp->maxactive <= 0) {
516#ifdef CONFIG_PREEMPT
517 rp->maxactive = max(10, 2 * NR_CPUS);
518#else
519 rp->maxactive = NR_CPUS;
520#endif
521 }
522 INIT_HLIST_HEAD(&rp->used_instances);
523 INIT_HLIST_HEAD(&rp->free_instances);
524 for (i = 0; i < rp->maxactive; i++) {
525 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
526 if (inst == NULL) {
527 free_rp_inst(rp);
528 return -ENOMEM;
529 }
530 INIT_HLIST_NODE(&inst->uflist);
531 hlist_add_head(&inst->uflist, &rp->free_instances);
532 }
533
534 rp->nmissed = 0;
535 /* Establish function entry probe point */
536 if ((ret = register_kprobe(&rp->kp)) != 0)
537 free_rp_inst(rp);
538 return ret;
539}
540
541#else /* ARCH_SUPPORTS_KRETPROBES */
542
543int register_kretprobe(struct kretprobe *rp)
544{
545 return -ENOSYS;
546}
547
548#endif /* ARCH_SUPPORTS_KRETPROBES */
549
550void unregister_kretprobe(struct kretprobe *rp)
551{
552 unsigned long flags;
553 struct kretprobe_instance *ri;
554
555 unregister_kprobe(&rp->kp);
556 /* No race here */
557 spin_lock_irqsave(&kprobe_lock, flags);
558 free_rp_inst(rp);
559 while ((ri = get_used_rp_inst(rp)) != NULL) {
560 ri->rp = NULL;
561 hlist_del(&ri->uflist);
562 }
563 spin_unlock_irqrestore(&kprobe_lock, flags);
564}
565
260static int __init init_kprobes(void) 566static int __init init_kprobes(void)
261{ 567{
262 int i, err = 0; 568 int i, err = 0;
263 569
264 /* FIXME allocate the probe table, currently defined statically */ 570 /* FIXME allocate the probe table, currently defined statically */
265 /* initialize all list heads */ 571 /* initialize all list heads */
266 for (i = 0; i < KPROBE_TABLE_SIZE; i++) 572 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
267 INIT_HLIST_HEAD(&kprobe_table[i]); 573 INIT_HLIST_HEAD(&kprobe_table[i]);
574 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
575 }
576
577 err = arch_init();
578 if (!err)
579 err = register_die_notifier(&kprobe_exceptions_nb);
268 580
269 err = register_die_notifier(&kprobe_exceptions_nb);
270 return err; 581 return err;
271} 582}
272 583
@@ -277,3 +588,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe);
277EXPORT_SYMBOL_GPL(register_jprobe); 588EXPORT_SYMBOL_GPL(register_jprobe);
278EXPORT_SYMBOL_GPL(unregister_jprobe); 589EXPORT_SYMBOL_GPL(unregister_jprobe);
279EXPORT_SYMBOL_GPL(jprobe_return); 590EXPORT_SYMBOL_GPL(jprobe_return);
591EXPORT_SYMBOL_GPL(register_kretprobe);
592EXPORT_SYMBOL_GPL(unregister_kretprobe);
593
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1f064a63f8cf..015fb69ad94d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
30KERNEL_ATTR_RO(hotplug_seqnum); 30KERNEL_ATTR_RO(hotplug_seqnum);
31#endif 31#endif
32 32
33#ifdef CONFIG_KEXEC
34#include <asm/kexec.h>
35
36static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
37{
38 return sprintf(page, "%p\n", (void *)crash_notes);
39}
40KERNEL_ATTR_RO(crash_notes);
41#endif
42
33decl_subsys(kernel, NULL, NULL); 43decl_subsys(kernel, NULL, NULL);
34EXPORT_SYMBOL_GPL(kernel_subsys); 44EXPORT_SYMBOL_GPL(kernel_subsys);
35 45
@@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = {
37#ifdef CONFIG_HOTPLUG 47#ifdef CONFIG_HOTPLUG
38 &hotplug_seqnum_attr.attr, 48 &hotplug_seqnum_attr.attr,
39#endif 49#endif
50#ifdef CONFIG_KEXEC
51 &crash_notes_attr.attr,
52#endif
40 NULL 53 NULL
41}; 54};
42 55
diff --git a/kernel/module.c b/kernel/module.c
index 83b3d376708c..068e271ab3a5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -35,6 +35,7 @@
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/stop_machine.h> 36#include <linux/stop_machine.h>
37#include <linux/device.h> 37#include <linux/device.h>
38#include <linux/string.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include <asm/semaphore.h> 40#include <asm/semaphore.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
@@ -370,6 +371,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
370#endif /* CONFIG_SMP */ 371#endif /* CONFIG_SMP */
371 372
372#ifdef CONFIG_MODULE_UNLOAD 373#ifdef CONFIG_MODULE_UNLOAD
374#define MODINFO_ATTR(field) \
375static void setup_modinfo_##field(struct module *mod, const char *s) \
376{ \
377 mod->field = kstrdup(s, GFP_KERNEL); \
378} \
379static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
380 struct module *mod, char *buffer) \
381{ \
382 return sprintf(buffer, "%s\n", mod->field); \
383} \
384static int modinfo_##field##_exists(struct module *mod) \
385{ \
386 return mod->field != NULL; \
387} \
388static void free_modinfo_##field(struct module *mod) \
389{ \
390 kfree(mod->field); \
391 mod->field = NULL; \
392} \
393static struct module_attribute modinfo_##field = { \
394 .attr = { .name = __stringify(field), .mode = 0444, \
395 .owner = THIS_MODULE }, \
396 .show = show_modinfo_##field, \
397 .setup = setup_modinfo_##field, \
398 .test = modinfo_##field##_exists, \
399 .free = free_modinfo_##field, \
400};
401
402MODINFO_ATTR(version);
403MODINFO_ATTR(srcversion);
404
405static struct module_attribute *modinfo_attrs[] = {
406 &modinfo_version,
407 &modinfo_srcversion,
408 NULL,
409};
410
373/* Init the unload section of the module. */ 411/* Init the unload section of the module. */
374static void module_unload_init(struct module *mod) 412static void module_unload_init(struct module *mod)
375{ 413{
@@ -379,7 +417,7 @@ static void module_unload_init(struct module *mod)
379 for (i = 0; i < NR_CPUS; i++) 417 for (i = 0; i < NR_CPUS; i++)
380 local_set(&mod->ref[i].count, 0); 418 local_set(&mod->ref[i].count, 0);
381 /* Hold reference count during initialization. */ 419 /* Hold reference count during initialization. */
382 local_set(&mod->ref[_smp_processor_id()].count, 1); 420 local_set(&mod->ref[raw_smp_processor_id()].count, 1);
383 /* Backwards compatibility macros put refcount during init. */ 421 /* Backwards compatibility macros put refcount during init. */
384 mod->waiter = current; 422 mod->waiter = current;
385} 423}
@@ -692,7 +730,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp)
692 return 0; 730 return 0;
693} 731}
694 732
695int set_obsolete(const char *val, struct kernel_param *kp) 733static int set_obsolete(const char *val, struct kernel_param *kp)
696{ 734{
697 unsigned int min, max; 735 unsigned int min, max;
698 unsigned int size, maxsize; 736 unsigned int size, maxsize;
@@ -1031,6 +1069,32 @@ static void module_remove_refcnt_attr(struct module *mod)
1031} 1069}
1032#endif 1070#endif
1033 1071
1072#ifdef CONFIG_MODULE_UNLOAD
1073static int module_add_modinfo_attrs(struct module *mod)
1074{
1075 struct module_attribute *attr;
1076 int error = 0;
1077 int i;
1078
1079 for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
1080 if (!attr->test ||
1081 (attr->test && attr->test(mod)))
1082 error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
1083 }
1084 return error;
1085}
1086
1087static void module_remove_modinfo_attrs(struct module *mod)
1088{
1089 struct module_attribute *attr;
1090 int i;
1091
1092 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1093 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
1094 attr->free(mod);
1095 }
1096}
1097#endif
1034 1098
1035static int mod_sysfs_setup(struct module *mod, 1099static int mod_sysfs_setup(struct module *mod,
1036 struct kernel_param *kparam, 1100 struct kernel_param *kparam,
@@ -1056,6 +1120,12 @@ static int mod_sysfs_setup(struct module *mod,
1056 if (err) 1120 if (err)
1057 goto out_unreg; 1121 goto out_unreg;
1058 1122
1123#ifdef CONFIG_MODULE_UNLOAD
1124 err = module_add_modinfo_attrs(mod);
1125 if (err)
1126 goto out_unreg;
1127#endif
1128
1059 return 0; 1129 return 0;
1060 1130
1061out_unreg: 1131out_unreg:
@@ -1066,6 +1136,9 @@ out:
1066 1136
1067static void mod_kobject_remove(struct module *mod) 1137static void mod_kobject_remove(struct module *mod)
1068{ 1138{
1139#ifdef CONFIG_MODULE_UNLOAD
1140 module_remove_modinfo_attrs(mod);
1141#endif
1069 module_remove_refcnt_attr(mod); 1142 module_remove_refcnt_attr(mod);
1070 module_param_sysfs_remove(mod); 1143 module_param_sysfs_remove(mod);
1071 1144
@@ -1311,6 +1384,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
1311 return NULL; 1384 return NULL;
1312} 1385}
1313 1386
1387#ifdef CONFIG_MODULE_UNLOAD
1388static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1389 unsigned int infoindex)
1390{
1391 struct module_attribute *attr;
1392 int i;
1393
1394 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1395 if (attr->setup)
1396 attr->setup(mod,
1397 get_modinfo(sechdrs,
1398 infoindex,
1399 attr->attr.name));
1400 }
1401}
1402#endif
1403
1314#ifdef CONFIG_KALLSYMS 1404#ifdef CONFIG_KALLSYMS
1315int is_exported(const char *name, const struct module *mod) 1405int is_exported(const char *name, const struct module *mod)
1316{ 1406{
@@ -1615,6 +1705,11 @@ static struct module *load_module(void __user *umod,
1615 /* Set up license info based on the info section */ 1705 /* Set up license info based on the info section */
1616 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1706 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1617 1707
1708#ifdef CONFIG_MODULE_UNLOAD
1709 /* Set up MODINFO_ATTR fields */
1710 setup_modinfo(mod, sechdrs, infoindex);
1711#endif
1712
1618 /* Fix up syms, so that st_value is a pointer to location. */ 1713 /* Fix up syms, so that st_value is a pointer to location. */
1619 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, 1714 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
1620 mod); 1715 mod);
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..74ba5f3e46c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
18#include <linux/sysrq.h> 18#include <linux/sysrq.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h>
21 22
22int panic_timeout; 23int panic_timeout;
23int panic_on_oops; 24int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
63 unsigned long caller = (unsigned long) __builtin_return_address(0); 64 unsigned long caller = (unsigned long) __builtin_return_address(0);
64#endif 65#endif
65 66
67 /*
68 * It's possible to come here directly from a panic-assertion and not
69 * have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though...
71 */
72 preempt_disable();
73
66 bust_spinlocks(1); 74 bust_spinlocks(1);
67 va_start(args, fmt); 75 va_start(args, fmt);
68 vsnprintf(buf, sizeof(buf), fmt, args); 76 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
70 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 78 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
71 bust_spinlocks(0); 79 bust_spinlocks(0);
72 80
81 /*
82 * If we have crashed and we have a crash kernel loaded let it handle
83 * everything else.
84 * Do we want to call this before we try to display a message?
85 */
86 crash_kexec(NULL);
87
73#ifdef CONFIG_SMP 88#ifdef CONFIG_SMP
89 /*
90 * Note smp_send_stop is the usual smp shutdown function, which
91 * unfortunately means it may not be hardened to work in a panic
92 * situation.
93 */
74 smp_send_stop(); 94 smp_send_stop();
75#endif 95#endif
76 96
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
79 if (!panic_blink) 99 if (!panic_blink)
80 panic_blink = no_blink; 100 panic_blink = no_blink;
81 101
82 if (panic_timeout > 0) 102 if (panic_timeout > 0) {
83 {
84 /* 103 /*
85 * Delay timeout seconds before rebooting the machine. 104 * Delay timeout seconds before rebooting the machine.
86 * We can't use the "normal" timers since we just panicked.. 105 * We can't use the "normal" timers since we just panicked..
diff --git a/kernel/params.c b/kernel/params.c
index 5513844bec13..d586c35ef8fc 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -629,7 +629,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
629 mk = to_module_kobject(kobj); 629 mk = to_module_kobject(kobj);
630 630
631 if (!attribute->show) 631 if (!attribute->show)
632 return -EPERM; 632 return -EIO;
633 633
634 if (!try_module_get(mk->mod)) 634 if (!try_module_get(mk->mod))
635 return -ENODEV; 635 return -ENODEV;
@@ -653,7 +653,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
653 mk = to_module_kobject(kobj); 653 mk = to_module_kobject(kobj);
654 654
655 if (!attribute->store) 655 if (!attribute->store)
656 return -EPERM; 656 return -EIO;
657 657
658 if (!try_module_get(mk->mod)) 658 if (!try_module_get(mk->mod))
659 return -ENODEV; 659 return -ENODEV;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index cabb63fc9e16..5b7b4736d82b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -89,23 +89,6 @@ static struct idr posix_timers_id;
89static DEFINE_SPINLOCK(idr_lock); 89static DEFINE_SPINLOCK(idr_lock);
90 90
91/* 91/*
92 * Just because the timer is not in the timer list does NOT mean it is
93 * inactive. It could be in the "fire" routine getting a new expire time.
94 */
95#define TIMER_INACTIVE 1
96
97#ifdef CONFIG_SMP
98# define timer_active(tmr) \
99 ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
100# define set_timer_inactive(tmr) \
101 do { \
102 (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
103 } while (0)
104#else
105# define timer_active(tmr) BARFY // error to use outside of SMP
106# define set_timer_inactive(tmr) do { } while (0)
107#endif
108/*
109 * we assume that the new SIGEV_THREAD_ID shares no bits with the other 92 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
110 * SIGEV values. Here we put out an error if this assumption fails. 93 * SIGEV values. Here we put out an error if this assumption fails.
111 */ 94 */
@@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer)
226 init_timer(&new_timer->it.real.timer); 209 init_timer(&new_timer->it.real.timer);
227 new_timer->it.real.timer.data = (unsigned long) new_timer; 210 new_timer->it.real.timer.data = (unsigned long) new_timer;
228 new_timer->it.real.timer.function = posix_timer_fn; 211 new_timer->it.real.timer.function = posix_timer_fn;
229 set_timer_inactive(new_timer);
230 return 0; 212 return 0;
231} 213}
232 214
@@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data)
480 int do_notify = 1; 462 int do_notify = 1;
481 463
482 spin_lock_irqsave(&timr->it_lock, flags); 464 spin_lock_irqsave(&timr->it_lock, flags);
483 set_timer_inactive(timr);
484 if (!list_empty(&timr->it.real.abs_timer_entry)) { 465 if (!list_empty(&timr->it.real.abs_timer_entry)) {
485 spin_lock(&abs_list.lock); 466 spin_lock(&abs_list.lock);
486 do { 467 do {
@@ -983,8 +964,8 @@ common_timer_set(struct k_itimer *timr, int flags,
983 * careful here. If smp we could be in the "fire" routine which will 964 * careful here. If smp we could be in the "fire" routine which will
984 * be spinning as we hold the lock. But this is ONLY an SMP issue. 965 * be spinning as we hold the lock. But this is ONLY an SMP issue.
985 */ 966 */
967 if (try_to_del_timer_sync(&timr->it.real.timer) < 0) {
986#ifdef CONFIG_SMP 968#ifdef CONFIG_SMP
987 if (timer_active(timr) && !del_timer(&timr->it.real.timer))
988 /* 969 /*
989 * It can only be active if on an other cpu. Since 970 * It can only be active if on an other cpu. Since
990 * we have cleared the interval stuff above, it should 971 * we have cleared the interval stuff above, it should
@@ -994,11 +975,9 @@ common_timer_set(struct k_itimer *timr, int flags,
994 * a "retry" exit status. 975 * a "retry" exit status.
995 */ 976 */
996 return TIMER_RETRY; 977 return TIMER_RETRY;
997
998 set_timer_inactive(timr);
999#else
1000 del_timer(&timr->it.real.timer);
1001#endif 978#endif
979 }
980
1002 remove_from_abslist(timr); 981 remove_from_abslist(timr);
1003 982
1004 timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 983 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
@@ -1083,8 +1062,9 @@ retry:
1083static inline int common_timer_del(struct k_itimer *timer) 1062static inline int common_timer_del(struct k_itimer *timer)
1084{ 1063{
1085 timer->it.real.incr = 0; 1064 timer->it.real.incr = 0;
1065
1066 if (try_to_del_timer_sync(&timer->it.real.timer) < 0) {
1086#ifdef CONFIG_SMP 1067#ifdef CONFIG_SMP
1087 if (timer_active(timer) && !del_timer(&timer->it.real.timer))
1088 /* 1068 /*
1089 * It can only be active if on an other cpu. Since 1069 * It can only be active if on an other cpu. Since
1090 * we have cleared the interval stuff above, it should 1070 * we have cleared the interval stuff above, it should
@@ -1094,9 +1074,9 @@ static inline int common_timer_del(struct k_itimer *timer)
1094 * a "retry" exit status. 1074 * a "retry" exit status.
1095 */ 1075 */
1096 return TIMER_RETRY; 1076 return TIMER_RETRY;
1097#else
1098 del_timer(&timer->it.real.timer);
1099#endif 1077#endif
1078 }
1079
1100 remove_from_abslist(timer); 1080 remove_from_abslist(timer);
1101 1081
1102 return 0; 1082 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 696387ffe49c..2c7121d9bff1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,8 +27,8 @@ config PM_DEBUG
27 like suspend support. 27 like suspend support.
28 28
29config SOFTWARE_SUSPEND 29config SOFTWARE_SUSPEND
30 bool "Software Suspend (EXPERIMENTAL)" 30 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP 31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
32 ---help--- 32 ---help---
33 Enable the possibility of suspending the machine. 33 Enable the possibility of suspending the machine.
34 It doesn't need APM. 34 It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
72 suspended image to. It will simply pick the first available swap 72 suspended image to. It will simply pick the first available swap
73 device. 73 device.
74 74
75config SUSPEND_SMP
76 bool
77 depends on HOTPLUG_CPU && X86 && PM
78 default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index fbdc634135a7..2f438d0eaa13 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6swsusp-smp-$(CONFIG_SMP) += smp.o
7
8obj-y := main.o process.o console.o pm.o 6obj-y := main.o process.o console.o pm.o
9obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o 7obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o
8
9obj-$(CONFIG_SUSPEND_SMP) += smp.o
10 10
11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02b6764034dc..fb8de63c2919 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,8 +117,8 @@ static void finish(void)
117{ 117{
118 device_resume(); 118 device_resume();
119 platform_finish(); 119 platform_finish();
120 enable_nonboot_cpus();
121 thaw_processes(); 120 thaw_processes();
121 enable_nonboot_cpus();
122 pm_restore_console(); 122 pm_restore_console();
123} 123}
124 124
@@ -131,28 +131,35 @@ static int prepare_processes(void)
131 131
132 sys_sync(); 132 sys_sync();
133 133
134 disable_nonboot_cpus();
135
134 if (freeze_processes()) { 136 if (freeze_processes()) {
135 error = -EBUSY; 137 error = -EBUSY;
136 return error; 138 goto thaw;
137 } 139 }
138 140
139 if (pm_disk_mode == PM_DISK_PLATFORM) { 141 if (pm_disk_mode == PM_DISK_PLATFORM) {
140 if (pm_ops && pm_ops->prepare) { 142 if (pm_ops && pm_ops->prepare) {
141 if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) 143 if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
142 return error; 144 goto thaw;
143 } 145 }
144 } 146 }
145 147
146 /* Free memory before shutting down devices. */ 148 /* Free memory before shutting down devices. */
147 free_some_memory(); 149 free_some_memory();
148
149 return 0; 150 return 0;
151thaw:
152 thaw_processes();
153 enable_nonboot_cpus();
154 pm_restore_console();
155 return error;
150} 156}
151 157
152static void unprepare_processes(void) 158static void unprepare_processes(void)
153{ 159{
154 enable_nonboot_cpus(); 160 platform_finish();
155 thaw_processes(); 161 thaw_processes();
162 enable_nonboot_cpus();
156 pm_restore_console(); 163 pm_restore_console();
157} 164}
158 165
@@ -160,15 +167,9 @@ static int prepare_devices(void)
160{ 167{
161 int error; 168 int error;
162 169
163 disable_nonboot_cpus(); 170 if ((error = device_suspend(PMSG_FREEZE)))
164 if ((error = device_suspend(PMSG_FREEZE))) {
165 printk("Some devices failed to suspend\n"); 171 printk("Some devices failed to suspend\n");
166 platform_finish(); 172 return error;
167 enable_nonboot_cpus();
168 return error;
169 }
170
171 return 0;
172} 173}
173 174
174/** 175/**
@@ -185,9 +186,9 @@ int pm_suspend_disk(void)
185 int error; 186 int error;
186 187
187 error = prepare_processes(); 188 error = prepare_processes();
188 if (!error) { 189 if (error)
189 error = prepare_devices(); 190 return error;
190 } 191 error = prepare_devices();
191 192
192 if (error) { 193 if (error) {
193 unprepare_processes(); 194 unprepare_processes();
@@ -250,7 +251,7 @@ static int software_resume(void)
250 251
251 if ((error = prepare_processes())) { 252 if ((error = prepare_processes())) {
252 swsusp_close(); 253 swsusp_close();
253 goto Cleanup; 254 goto Done;
254 } 255 }
255 256
256 pr_debug("PM: Reading swsusp image.\n"); 257 pr_debug("PM: Reading swsusp image.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4cdebc972ff2..c94cb9e95090 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state)
55 55
56 pm_prepare_console(); 56 pm_prepare_console();
57 57
58 disable_nonboot_cpus();
59
60 if (num_online_cpus() != 1) {
61 error = -EPERM;
62 goto Enable_cpu;
63 }
64
58 if (freeze_processes()) { 65 if (freeze_processes()) {
59 error = -EAGAIN; 66 error = -EAGAIN;
60 goto Thaw; 67 goto Thaw;
@@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state)
75 pm_ops->finish(state); 82 pm_ops->finish(state);
76 Thaw: 83 Thaw:
77 thaw_processes(); 84 thaw_processes();
85 Enable_cpu:
86 enable_nonboot_cpus();
78 pm_restore_console(); 87 pm_restore_console();
79 return error; 88 return error;
80} 89}
@@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state)
113 if (pm_ops && pm_ops->finish) 122 if (pm_ops && pm_ops->finish)
114 pm_ops->finish(state); 123 pm_ops->finish(state);
115 thaw_processes(); 124 thaw_processes();
125 enable_nonboot_cpus();
116 pm_restore_console(); 126 pm_restore_console();
117} 127}
118 128
@@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state)
150 goto Unlock; 160 goto Unlock;
151 } 161 }
152 162
153 /* Suspend is hard to get right on SMP. */
154 if (num_online_cpus() != 1) {
155 error = -EPERM;
156 goto Unlock;
157 }
158
159 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 163 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
160 if ((error = suspend_prepare(state))) 164 if ((error = suspend_prepare(state)))
161 goto Unlock; 165 goto Unlock;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78d92dc6a1ed..0a086640bcfc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p)
32} 32}
33 33
34/* Refrigerator is place where frozen processes are stored :-). */ 34/* Refrigerator is place where frozen processes are stored :-). */
35void refrigerator(unsigned long flag) 35void refrigerator(void)
36{ 36{
37 /* Hmm, should we be allowed to suspend when there are realtime 37 /* Hmm, should we be allowed to suspend when there are realtime
38 processes around? */ 38 processes around? */
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag)
41 current->state = TASK_UNINTERRUPTIBLE; 41 current->state = TASK_UNINTERRUPTIBLE;
42 pr_debug("%s entered refrigerator\n", current->comm); 42 pr_debug("%s entered refrigerator\n", current->comm);
43 printk("="); 43 printk("=");
44 current->flags &= ~PF_FREEZE;
45 44
45 frozen_process(current);
46 spin_lock_irq(&current->sighand->siglock); 46 spin_lock_irq(&current->sighand->siglock);
47 recalc_sigpending(); /* We sent fake signal, clean it up */ 47 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock); 48 spin_unlock_irq(&current->sighand->siglock);
49 49
50 current->flags |= PF_FROZEN; 50 while (frozen(current))
51 while (current->flags & PF_FROZEN)
52 schedule(); 51 schedule();
53 pr_debug("%s left refrigerator\n", current->comm); 52 pr_debug("%s left refrigerator\n", current->comm);
54 current->state = save; 53 current->state = save;
@@ -57,10 +56,10 @@ void refrigerator(unsigned long flag)
57/* 0 = success, else # of processes that we failed to stop */ 56/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 57int freeze_processes(void)
59{ 58{
60 int todo; 59 int todo;
61 unsigned long start_time; 60 unsigned long start_time;
62 struct task_struct *g, *p; 61 struct task_struct *g, *p;
63 62
64 printk( "Stopping tasks: " ); 63 printk( "Stopping tasks: " );
65 start_time = jiffies; 64 start_time = jiffies;
66 do { 65 do {
@@ -70,14 +69,12 @@ int freeze_processes(void)
70 unsigned long flags; 69 unsigned long flags;
71 if (!freezeable(p)) 70 if (!freezeable(p))
72 continue; 71 continue;
73 if ((p->flags & PF_FROZEN) || 72 if ((frozen(p)) ||
74 (p->state == TASK_TRACED) || 73 (p->state == TASK_TRACED) ||
75 (p->state == TASK_STOPPED)) 74 (p->state == TASK_STOPPED))
76 continue; 75 continue;
77 76
78 /* FIXME: smp problem here: we may not access other process' flags 77 freeze(p);
79 without locking */
80 p->flags |= PF_FREEZE;
81 spin_lock_irqsave(&p->sighand->siglock, flags); 78 spin_lock_irqsave(&p->sighand->siglock, flags);
82 signal_wake_up(p, 0); 79 signal_wake_up(p, 0);
83 spin_unlock_irqrestore(&p->sighand->siglock, flags); 80 spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -91,7 +88,7 @@ int freeze_processes(void)
91 return todo; 88 return todo;
92 } 89 }
93 } while(todo); 90 } while(todo);
94 91
95 printk( "|\n" ); 92 printk( "|\n" );
96 BUG_ON(in_atomic()); 93 BUG_ON(in_atomic());
97 return 0; 94 return 0;
@@ -106,10 +103,7 @@ void thaw_processes(void)
106 do_each_thread(g, p) { 103 do_each_thread(g, p) {
107 if (!freezeable(p)) 104 if (!freezeable(p))
108 continue; 105 continue;
109 if (p->flags & PF_FROZEN) { 106 if (!thaw_process(p))
110 p->flags &= ~PF_FROZEN;
111 wake_up_process(p);
112 } else
113 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 107 printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
114 } while_each_thread(g, p); 108 } while_each_thread(g, p);
115 109
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index cba3584b80fe..bbe23079c62c 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -13,73 +13,52 @@
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/suspend.h> 14#include <linux/suspend.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/cpu.h>
16#include <asm/atomic.h> 17#include <asm/atomic.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18 19
19static atomic_t cpu_counter, freeze; 20/* This is protected by pm_sem semaphore */
20 21static cpumask_t frozen_cpus;
21
22static void smp_pause(void * data)
23{
24 struct saved_context ctxt;
25 __save_processor_state(&ctxt);
26 printk("Sleeping in:\n");
27 dump_stack();
28 atomic_inc(&cpu_counter);
29 while (atomic_read(&freeze)) {
30 /* FIXME: restore takes place at random piece inside this.
31 This should probably be written in assembly, and
32 preserve general-purpose registers, too
33
34 What about stack? We may need to move to new stack here.
35
36 This should better be ran with interrupts disabled.
37 */
38 cpu_relax();
39 barrier();
40 }
41 atomic_dec(&cpu_counter);
42 __restore_processor_state(&ctxt);
43}
44
45static cpumask_t oldmask;
46 22
47void disable_nonboot_cpus(void) 23void disable_nonboot_cpus(void)
48{ 24{
49 oldmask = current->cpus_allowed; 25 int cpu, error;
50 set_cpus_allowed(current, cpumask_of_cpu(0));
51 printk("Freezing CPUs (at %d)", _smp_processor_id());
52 current->state = TASK_INTERRUPTIBLE;
53 schedule_timeout(HZ);
54 printk("...");
55 BUG_ON(_smp_processor_id() != 0);
56
57 /* FIXME: for this to work, all the CPUs must be running
58 * "idle" thread (or we deadlock). Is that guaranteed? */
59 26
60 atomic_set(&cpu_counter, 0); 27 error = 0;
61 atomic_set(&freeze, 1); 28 cpus_clear(frozen_cpus);
62 smp_call_function(smp_pause, NULL, 0, 0); 29 printk("Freezing cpus ...\n");
63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { 30 for_each_online_cpu(cpu) {
64 cpu_relax(); 31 if (cpu == 0)
65 barrier(); 32 continue;
33 error = cpu_down(cpu);
34 if (!error) {
35 cpu_set(cpu, frozen_cpus);
36 printk("CPU%d is down\n", cpu);
37 continue;
38 }
39 printk("Error taking cpu %d down: %d\n", cpu, error);
66 } 40 }
67 printk("ok\n"); 41 BUG_ON(smp_processor_id() != 0);
42 if (error)
43 panic("cpus not sleeping");
68} 44}
69 45
70void enable_nonboot_cpus(void) 46void enable_nonboot_cpus(void)
71{ 47{
72 printk("Restarting CPUs"); 48 int cpu, error;
73 atomic_set(&freeze, 0);
74 while (atomic_read(&cpu_counter)) {
75 cpu_relax();
76 barrier();
77 }
78 printk("...");
79 set_cpus_allowed(current, oldmask);
80 schedule();
81 printk("ok\n");
82 49
50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = smp_prepare_cpu(cpu);
53 if (!error)
54 error = cpu_up(cpu);
55 if (!error) {
56 printk("CPU%d is up\n", cpu);
57 continue;
58 }
59 printk("Error taking cpu %d up: %d\n", cpu, error);
60 panic("Not enough cpus");
61 }
62 cpus_clear(frozen_cpus);
83} 63}
84 64
85
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 90b3b68dee3f..c285fc5a2320 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -10,12 +10,12 @@
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
11 * 11 *
12 * I'd like to thank the following people for their work: 12 * I'd like to thank the following people for their work:
13 * 13 *
14 * Pavel Machek <pavel@ucw.cz>: 14 * Pavel Machek <pavel@ucw.cz>:
15 * Modifications, defectiveness pointing, being with me at the very beginning, 15 * Modifications, defectiveness pointing, being with me at the very beginning,
16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. 16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17 * 17 *
18 * Steve Doddi <dirk@loth.demon.co.uk>: 18 * Steve Doddi <dirk@loth.demon.co.uk>:
19 * Support the possibility of hardware state restoring. 19 * Support the possibility of hardware state restoring.
20 * 20 *
21 * Raph <grey.havens@earthling.net>: 21 * Raph <grey.havens@earthling.net>:
@@ -81,14 +81,14 @@ static int nr_copy_pages_check;
81extern char resume_file[]; 81extern char resume_file[];
82 82
83/* Local variables that should not be affected by save */ 83/* Local variables that should not be affected by save */
84unsigned int nr_copy_pages __nosavedata = 0; 84static unsigned int nr_copy_pages __nosavedata = 0;
85 85
86/* Suspend pagedir is allocated before final copy, therefore it 86/* Suspend pagedir is allocated before final copy, therefore it
87 must be freed after resume 87 must be freed after resume
88 88
89 Warning: this is evil. There are actually two pagedirs at time of 89 Warning: this is evil. There are actually two pagedirs at time of
90 resume. One is "pagedir_save", which is empty frame allocated at 90 resume. One is "pagedir_save", which is empty frame allocated at
91 time of suspend, that must be freed. Second is "pagedir_nosave", 91 time of suspend, that must be freed. Second is "pagedir_nosave",
92 allocated at time of resume, that travels through memory not to 92 allocated at time of resume, that travels through memory not to
93 collide with anything. 93 collide with anything.
94 94
@@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev)
132{ 132{
133 int error; 133 int error;
134 134
135 rw_swap_page_sync(READ, 135 rw_swap_page_sync(READ,
136 swp_entry(root_swap, 0), 136 swp_entry(root_swap, 0),
137 virt_to_page((unsigned long)&swsusp_header)); 137 virt_to_page((unsigned long)&swsusp_header));
138 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 138 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
@@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
140 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 140 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
141 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 141 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
142 swsusp_header.swsusp_info = prev; 142 swsusp_header.swsusp_info = prev;
143 error = rw_swap_page_sync(WRITE, 143 error = rw_swap_page_sync(WRITE,
144 swp_entry(root_swap, 0), 144 swp_entry(root_swap, 0),
145 virt_to_page((unsigned long) 145 virt_to_page((unsigned long)
146 &swsusp_header)); 146 &swsusp_header));
@@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
174static int swsusp_swap_check(void) /* This is called before saving image */ 174static int swsusp_swap_check(void) /* This is called before saving image */
175{ 175{
176 int i, len; 176 int i, len;
177 177
178 len=strlen(resume_file); 178 len=strlen(resume_file);
179 root_swap = 0xFFFF; 179 root_swap = 0xFFFF;
180 180
181 swap_list_lock(); 181 swap_list_lock();
182 for(i=0; i<MAX_SWAPFILES; i++) { 182 for (i=0; i<MAX_SWAPFILES; i++) {
183 if (swap_info[i].flags == 0) { 183 if (swap_info[i].flags == 0) {
184 swapfile_used[i]=SWAPFILE_UNUSED; 184 swapfile_used[i]=SWAPFILE_UNUSED;
185 } else { 185 } else {
186 if(!len) { 186 if (!len) {
187 printk(KERN_WARNING "resume= option should be used to set suspend device" ); 187 printk(KERN_WARNING "resume= option should be used to set suspend device" );
188 if(root_swap == 0xFFFF) { 188 if (root_swap == 0xFFFF) {
189 swapfile_used[i] = SWAPFILE_SUSPEND; 189 swapfile_used[i] = SWAPFILE_SUSPEND;
190 root_swap = i; 190 root_swap = i;
191 } else 191 } else
192 swapfile_used[i] = SWAPFILE_IGNORED; 192 swapfile_used[i] = SWAPFILE_IGNORED;
193 } else { 193 } else {
194 /* we ignore all swap devices that are not the resume_file */ 194 /* we ignore all swap devices that are not the resume_file */
195 if (is_resume_device(&swap_info[i])) { 195 if (is_resume_device(&swap_info[i])) {
@@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */
209 * This is called after saving image so modification 209 * This is called after saving image so modification
210 * will be lost after resume... and that's what we want. 210 * will be lost after resume... and that's what we want.
211 * we make the device unusable. A new call to 211 * we make the device unusable. A new call to
212 * lock_swapdevices can unlock the devices. 212 * lock_swapdevices can unlock the devices.
213 */ 213 */
214static void lock_swapdevices(void) 214static void lock_swapdevices(void)
215{ 215{
216 int i; 216 int i;
217 217
218 swap_list_lock(); 218 swap_list_lock();
219 for(i = 0; i< MAX_SWAPFILES; i++) 219 for (i = 0; i< MAX_SWAPFILES; i++)
220 if(swapfile_used[i] == SWAPFILE_IGNORED) { 220 if (swapfile_used[i] == SWAPFILE_IGNORED) {
221 swap_info[i].flags ^= 0xFF; 221 swap_info[i].flags ^= 0xFF;
222 } 222 }
223 swap_list_unlock(); 223 swap_list_unlock();
@@ -229,7 +229,7 @@ static void lock_swapdevices(void)
229 * @loc: Place to store the entry we used. 229 * @loc: Place to store the entry we used.
230 * 230 *
231 * Allocate a new swap entry and 'sync' it. Note we discard -EIO 231 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
232 * errors. That is an artifact left over from swsusp. It did not 232 * errors. That is an artifact left over from swsusp. It did not
233 * check the return of rw_swap_page_sync() at all, since most pages 233 * check the return of rw_swap_page_sync() at all, since most pages
234 * written back to swap would return -EIO. 234 * written back to swap would return -EIO.
235 * This is a partial improvement, since we will at least return other 235 * This is a partial improvement, since we will at least return other
@@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
241 int error = 0; 241 int error = 0;
242 242
243 entry = get_swap_page(); 243 entry = get_swap_page();
244 if (swp_offset(entry) && 244 if (swp_offset(entry) &&
245 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { 245 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
246 error = rw_swap_page_sync(WRITE, entry, 246 error = rw_swap_page_sync(WRITE, entry,
247 virt_to_page(addr)); 247 virt_to_page(addr));
@@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
257/** 257/**
258 * data_free - Free the swap entries used by the saved image. 258 * data_free - Free the swap entries used by the saved image.
259 * 259 *
260 * Walk the list of used swap entries and free each one. 260 * Walk the list of used swap entries and free each one.
261 * This is only used for cleanup when suspend fails. 261 * This is only used for cleanup when suspend fails.
262 */ 262 */
263static void data_free(void) 263static void data_free(void)
@@ -290,7 +290,7 @@ static int data_write(void)
290 mod = 1; 290 mod = 1;
291 291
292 printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); 292 printk( "Writing data to swap (%d pages)... ", nr_copy_pages );
293 for_each_pbe(p, pagedir_nosave) { 293 for_each_pbe (p, pagedir_nosave) {
294 if (!(i%mod)) 294 if (!(i%mod))
295 printk( "\b\b\b\b%3d%%", i / mod ); 295 printk( "\b\b\b\b%3d%%", i / mod );
296 if ((error = write_page(p->address, &(p->swap_address)))) 296 if ((error = write_page(p->address, &(p->swap_address))))
@@ -335,7 +335,7 @@ static int close_swap(void)
335 335
336 dump_info(); 336 dump_info();
337 error = write_page((unsigned long)&swsusp_info, &entry); 337 error = write_page((unsigned long)&swsusp_info, &entry);
338 if (!error) { 338 if (!error) {
339 printk( "S" ); 339 printk( "S" );
340 error = mark_swapfiles(entry); 340 error = mark_swapfiles(entry);
341 printk( "|\n" ); 341 printk( "|\n" );
@@ -370,7 +370,7 @@ static int write_pagedir(void)
370 struct pbe * pbe; 370 struct pbe * pbe;
371 371
372 printk( "Writing pagedir..."); 372 printk( "Writing pagedir...");
373 for_each_pb_page(pbe, pagedir_nosave) { 373 for_each_pb_page (pbe, pagedir_nosave) {
374 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) 374 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
375 return error; 375 return error;
376 } 376 }
@@ -472,7 +472,7 @@ static int save_highmem(void)
472 int res = 0; 472 int res = 0;
473 473
474 pr_debug("swsusp: Saving Highmem\n"); 474 pr_debug("swsusp: Saving Highmem\n");
475 for_each_zone(zone) { 475 for_each_zone (zone) {
476 if (is_highmem(zone)) 476 if (is_highmem(zone))
477 res = save_highmem_zone(zone); 477 res = save_highmem_zone(zone);
478 if (res) 478 if (res)
@@ -547,7 +547,7 @@ static void count_data_pages(void)
547 547
548 nr_copy_pages = 0; 548 nr_copy_pages = 0;
549 549
550 for_each_zone(zone) { 550 for_each_zone (zone) {
551 if (is_highmem(zone)) 551 if (is_highmem(zone))
552 continue; 552 continue;
553 mark_free_pages(zone); 553 mark_free_pages(zone);
@@ -562,9 +562,9 @@ static void copy_data_pages(void)
562 struct zone *zone; 562 struct zone *zone;
563 unsigned long zone_pfn; 563 unsigned long zone_pfn;
564 struct pbe * pbe = pagedir_nosave; 564 struct pbe * pbe = pagedir_nosave;
565 565
566 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); 566 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
567 for_each_zone(zone) { 567 for_each_zone (zone) {
568 if (is_highmem(zone)) 568 if (is_highmem(zone))
569 continue; 569 continue;
570 mark_free_pages(zone); 570 mark_free_pages(zone);
@@ -702,7 +702,7 @@ static void free_image_pages(void)
702{ 702{
703 struct pbe * p; 703 struct pbe * p;
704 704
705 for_each_pbe(p, pagedir_save) { 705 for_each_pbe (p, pagedir_save) {
706 if (p->address) { 706 if (p->address) {
707 ClearPageNosave(virt_to_page(p->address)); 707 ClearPageNosave(virt_to_page(p->address));
708 free_page(p->address); 708 free_page(p->address);
@@ -719,7 +719,7 @@ static int alloc_image_pages(void)
719{ 719{
720 struct pbe * p; 720 struct pbe * p;
721 721
722 for_each_pbe(p, pagedir_save) { 722 for_each_pbe (p, pagedir_save) {
723 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); 723 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
724 if (!p->address) 724 if (!p->address)
725 return -ENOMEM; 725 return -ENOMEM;
@@ -740,7 +740,7 @@ void swsusp_free(void)
740/** 740/**
741 * enough_free_mem - Make sure we enough free memory to snapshot. 741 * enough_free_mem - Make sure we enough free memory to snapshot.
742 * 742 *
743 * Returns TRUE or FALSE after checking the number of available 743 * Returns TRUE or FALSE after checking the number of available
744 * free pages. 744 * free pages.
745 */ 745 */
746 746
@@ -758,11 +758,11 @@ static int enough_free_mem(void)
758/** 758/**
759 * enough_swap - Make sure we have enough swap to save the image. 759 * enough_swap - Make sure we have enough swap to save the image.
760 * 760 *
761 * Returns TRUE or FALSE after checking the total amount of swap 761 * Returns TRUE or FALSE after checking the total amount of swap
762 * space avaiable. 762 * space avaiable.
763 * 763 *
764 * FIXME: si_swapinfo(&i) returns all swap devices information. 764 * FIXME: si_swapinfo(&i) returns all swap devices information.
765 * We should only consider resume_device. 765 * We should only consider resume_device.
766 */ 766 */
767 767
768static int enough_swap(void) 768static int enough_swap(void)
@@ -781,18 +781,18 @@ static int swsusp_alloc(void)
781{ 781{
782 int error; 782 int error;
783 783
784 pagedir_nosave = NULL;
785 nr_copy_pages = calc_nr(nr_copy_pages);
786
784 pr_debug("suspend: (pages needed: %d + %d free: %d)\n", 787 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
785 nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); 788 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
786 789
787 pagedir_nosave = NULL;
788 if (!enough_free_mem()) 790 if (!enough_free_mem())
789 return -ENOMEM; 791 return -ENOMEM;
790 792
791 if (!enough_swap()) 793 if (!enough_swap())
792 return -ENOSPC; 794 return -ENOSPC;
793 795
794 nr_copy_pages = calc_nr(nr_copy_pages);
795
796 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { 796 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
797 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 797 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
798 return -ENOMEM; 798 return -ENOMEM;
@@ -827,8 +827,8 @@ static int suspend_prepare_image(void)
827 error = swsusp_alloc(); 827 error = swsusp_alloc();
828 if (error) 828 if (error)
829 return error; 829 return error;
830 830
831 /* During allocating of suspend pagedir, new cold pages may appear. 831 /* During allocating of suspend pagedir, new cold pages may appear.
832 * Kill them. 832 * Kill them.
833 */ 833 */
834 drain_local_pages(); 834 drain_local_pages();
@@ -929,21 +929,6 @@ int swsusp_resume(void)
929 return error; 929 return error;
930} 930}
931 931
932/* More restore stuff */
933
934/*
935 * Returns true if given address/order collides with any orig_address
936 */
937static int does_collide_order(unsigned long addr, int order)
938{
939 int i;
940
941 for (i=0; i < (1<<order); i++)
942 if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
943 return 1;
944 return 0;
945}
946
947/** 932/**
948 * On resume, for storing the PBE list and the image, 933 * On resume, for storing the PBE list and the image,
949 * we can only use memory pages that do not conflict with the pages 934 * we can only use memory pages that do not conflict with the pages
@@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
973 unsigned long m; 958 unsigned long m;
974 959
975 m = get_zeroed_page(gfp_mask); 960 m = get_zeroed_page(gfp_mask);
976 while (does_collide_order(m, 0)) { 961 while (!PageNosaveFree(virt_to_page(m))) {
977 eat_page((void *)m); 962 eat_page((void *)m);
978 m = get_zeroed_page(gfp_mask); 963 m = get_zeroed_page(gfp_mask);
979 if (!m) 964 if (!m)
@@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1045 1030
1046 /* Set page flags */ 1031 /* Set page flags */
1047 1032
1048 for_each_zone(zone) { 1033 for_each_zone (zone) {
1049 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 1034 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
1050 SetPageNosaveFree(pfn_to_page(zone_pfn + 1035 SetPageNosaveFree(pfn_to_page(zone_pfn +
1051 zone->zone_start_pfn)); 1036 zone->zone_start_pfn));
@@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1061 /* Relocate colliding pages */ 1046 /* Relocate colliding pages */
1062 1047
1063 for_each_pb_page (pbpage, pblist) { 1048 for_each_pb_page (pbpage, pblist) {
1064 if (does_collide_order((unsigned long)pbpage, 0)) { 1049 if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
1065 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); 1050 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
1066 if (!m) { 1051 if (!m) {
1067 error = -ENOMEM; 1052 error = -ENOMEM;
@@ -1193,8 +1178,10 @@ static const char * sanity_check(void)
1193 return "version"; 1178 return "version";
1194 if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) 1179 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1195 return "machine"; 1180 return "machine";
1181#if 0
1196 if(swsusp_info.cpus != num_online_cpus()) 1182 if(swsusp_info.cpus != num_online_cpus())
1197 return "number of cpus"; 1183 return "number of cpus";
1184#endif
1198 return NULL; 1185 return NULL;
1199} 1186}
1200 1187
diff --git a/kernel/printk.c b/kernel/printk.c
index 01b58d7d17ff..5092397fac29 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
588 log_level_unknown = 1; 588 log_level_unknown = 1;
589 } 589 }
590 590
591 if (!cpu_online(smp_processor_id()) && 591 if (!cpu_online(smp_processor_id())) {
592 system_state != SYSTEM_RUNNING) {
593 /* 592 /*
594 * Some console drivers may assume that per-cpu resources have 593 * Some console drivers may assume that per-cpu resources have
595 * been allocated. So don't allow them to be called by this 594 * been allocated. So don't allow them to be called by this
@@ -876,8 +875,10 @@ void register_console(struct console * console)
876 break; 875 break;
877 console->flags |= CON_ENABLED; 876 console->flags |= CON_ENABLED;
878 console->index = console_cmdline[i].index; 877 console->index = console_cmdline[i].index;
879 if (i == preferred_console) 878 if (i == selected_console) {
880 console->flags |= CON_CONSDEV; 879 console->flags |= CON_CONSDEV;
880 preferred_console = selected_console;
881 }
881 break; 882 break;
882 } 883 }
883 884
@@ -897,6 +898,8 @@ void register_console(struct console * console)
897 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 898 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
898 console->next = console_drivers; 899 console->next = console_drivers;
899 console_drivers = console; 900 console_drivers = console;
901 if (console->next)
902 console->next->flags &= ~CON_CONSDEV;
900 } else { 903 } else {
901 console->next = console_drivers->next; 904 console->next = console_drivers->next;
902 console_drivers->next = console; 905 console_drivers->next = console;
@@ -937,10 +940,14 @@ int unregister_console(struct console * console)
937 /* If last console is removed, we re-enable picking the first 940 /* If last console is removed, we re-enable picking the first
938 * one that gets registered. Without that, pmac early boot console 941 * one that gets registered. Without that, pmac early boot console
939 * would prevent fbcon from taking over. 942 * would prevent fbcon from taking over.
943 *
944 * If this isn't the last console and it has CON_CONSDEV set, we
945 * need to set it on the next preferred console.
940 */ 946 */
941 if (console_drivers == NULL) 947 if (console_drivers == NULL)
942 preferred_console = selected_console; 948 preferred_console = selected_console;
943 949 else if (console->flags & CON_CONSDEV)
950 console_drivers->flags |= CON_CONSDEV;
944 951
945 release_console_sem(); 952 release_console_sem();
946 return res; 953 return res;
diff --git a/kernel/resource.c b/kernel/resource.c
index 52f696f11adf..26967e042201 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new,
263 new->start = min; 263 new->start = min;
264 if (new->end > max) 264 if (new->end > max)
265 new->end = max; 265 new->end = max;
266 new->start = (new->start + align - 1) & ~(align - 1); 266 new->start = ALIGN(new->start, align);
267 if (alignf) 267 if (alignf)
268 alignf(alignf_data, new, size, align); 268 alignf(alignf_data, new, size, align);
269 if (new->start < new->end && new->end - new->start >= size - 1) { 269 if (new->start < new->end && new->end - new->start >= size - 1) {
diff --git a/kernel/sched.c b/kernel/sched.c
index f12a0c8a7d98..5f2182d42241 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
166#define SCALE_PRIO(x, prio) \ 166#define SCALE_PRIO(x, prio) \
167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168 168
169static inline unsigned int task_timeslice(task_t *p) 169static unsigned int task_timeslice(task_t *p)
170{ 170{
171 if (p->static_prio < NICE_TO_PRIO(0)) 171 if (p->static_prio < NICE_TO_PRIO(0))
172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -206,7 +206,7 @@ struct runqueue {
206 */ 206 */
207 unsigned long nr_running; 207 unsigned long nr_running;
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
209 unsigned long cpu_load; 209 unsigned long cpu_load[3];
210#endif 210#endif
211 unsigned long long nr_switches; 211 unsigned long long nr_switches;
212 212
@@ -260,23 +260,87 @@ struct runqueue {
260 260
261static DEFINE_PER_CPU(struct runqueue, runqueues); 261static DEFINE_PER_CPU(struct runqueue, runqueues);
262 262
263/*
264 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
265 * See detach_destroy_domains: synchronize_sched for details.
266 *
267 * The domain tree of any CPU may only be accessed from within
268 * preempt-disabled sections.
269 */
263#define for_each_domain(cpu, domain) \ 270#define for_each_domain(cpu, domain) \
264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) 271for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
265 272
266#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 273#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
267#define this_rq() (&__get_cpu_var(runqueues)) 274#define this_rq() (&__get_cpu_var(runqueues))
268#define task_rq(p) cpu_rq(task_cpu(p)) 275#define task_rq(p) cpu_rq(task_cpu(p))
269#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 276#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
270 277
271/*
272 * Default context-switch locking:
273 */
274#ifndef prepare_arch_switch 278#ifndef prepare_arch_switch
275# define prepare_arch_switch(rq, next) do { } while (0) 279# define prepare_arch_switch(next) do { } while (0)
276# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) 280#endif
277# define task_running(rq, p) ((rq)->curr == (p)) 281#ifndef finish_arch_switch
282# define finish_arch_switch(prev) do { } while (0)
278#endif 283#endif
279 284
285#ifndef __ARCH_WANT_UNLOCKED_CTXSW
286static inline int task_running(runqueue_t *rq, task_t *p)
287{
288 return rq->curr == p;
289}
290
291static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
292{
293}
294
295static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
296{
297 spin_unlock_irq(&rq->lock);
298}
299
300#else /* __ARCH_WANT_UNLOCKED_CTXSW */
301static inline int task_running(runqueue_t *rq, task_t *p)
302{
303#ifdef CONFIG_SMP
304 return p->oncpu;
305#else
306 return rq->curr == p;
307#endif
308}
309
310static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
311{
312#ifdef CONFIG_SMP
313 /*
314 * We can optimise this out completely for !SMP, because the
315 * SMP rebalancing from interrupt is the only thing that cares
316 * here.
317 */
318 next->oncpu = 1;
319#endif
320#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
321 spin_unlock_irq(&rq->lock);
322#else
323 spin_unlock(&rq->lock);
324#endif
325}
326
327static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
328{
329#ifdef CONFIG_SMP
330 /*
331 * After ->oncpu is cleared, the task can be moved to a different CPU.
332 * We must ensure this doesn't happen until the switch is completely
333 * finished.
334 */
335 smp_wmb();
336 prev->oncpu = 0;
337#endif
338#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
339 local_irq_enable();
340#endif
341}
342#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
343
280/* 344/*
281 * task_rq_lock - lock the runqueue a given task resides on and disable 345 * task_rq_lock - lock the runqueue a given task resides on and disable
282 * interrupts. Note the ordering: we can safely lookup the task_rq without 346 * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
309 * bump this up when changing the output format or the meaning of an existing 373 * bump this up when changing the output format or the meaning of an existing
310 * format, so that tools can adapt (or abort) 374 * format, so that tools can adapt (or abort)
311 */ 375 */
312#define SCHEDSTAT_VERSION 11 376#define SCHEDSTAT_VERSION 12
313 377
314static int show_schedstat(struct seq_file *seq, void *v) 378static int show_schedstat(struct seq_file *seq, void *v)
315{ 379{
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
338 402
339#ifdef CONFIG_SMP 403#ifdef CONFIG_SMP
340 /* domain-specific stats */ 404 /* domain-specific stats */
405 preempt_disable();
341 for_each_domain(cpu, sd) { 406 for_each_domain(cpu, sd) {
342 enum idle_type itype; 407 enum idle_type itype;
343 char mask_str[NR_CPUS]; 408 char mask_str[NR_CPUS];
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
356 sd->lb_nobusyq[itype], 421 sd->lb_nobusyq[itype],
357 sd->lb_nobusyg[itype]); 422 sd->lb_nobusyg[itype]);
358 } 423 }
359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", 424 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 425 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361 sd->sbe_pushed, sd->sbe_attempts, 426 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
427 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 428 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
363 } 429 }
430 preempt_enable();
364#endif 431#endif
365 } 432 }
366 return 0; 433 return 0;
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)
414 return rq; 481 return rq;
415} 482}
416 483
417#ifdef CONFIG_SCHED_SMT
418static int cpu_and_siblings_are_idle(int cpu)
419{
420 int sib;
421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
422 if (idle_cpu(sib))
423 continue;
424 return 0;
425 }
426
427 return 1;
428}
429#else
430#define cpu_and_siblings_are_idle(A) idle_cpu(A)
431#endif
432
433#ifdef CONFIG_SCHEDSTATS 484#ifdef CONFIG_SCHEDSTATS
434/* 485/*
435 * Called when a process is dequeued from the active array and given 486 * Called when a process is dequeued from the active array and given
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
622 rq->nr_running++; 673 rq->nr_running++;
623} 674}
624 675
625static void recalc_task_prio(task_t *p, unsigned long long now) 676static int recalc_task_prio(task_t *p, unsigned long long now)
626{ 677{
627 /* Caller must always ensure 'now >= p->timestamp' */ 678 /* Caller must always ensure 'now >= p->timestamp' */
628 unsigned long long __sleep_time = now - p->timestamp; 679 unsigned long long __sleep_time = now - p->timestamp;
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
681 } 732 }
682 } 733 }
683 734
684 p->prio = effective_prio(p); 735 return effective_prio(p);
685} 736}
686 737
687/* 738/*
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
704 } 755 }
705#endif 756#endif
706 757
707 recalc_task_prio(p, now); 758 p->prio = recalc_task_prio(p, now);
708 759
709 /* 760 /*
710 * This checks to make sure it's not an uninterruptible task 761 * This checks to make sure it's not an uninterruptible task
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)
782} 833}
783 834
784#ifdef CONFIG_SMP 835#ifdef CONFIG_SMP
785enum request_type {
786 REQ_MOVE_TASK,
787 REQ_SET_DOMAIN,
788};
789
790typedef struct { 836typedef struct {
791 struct list_head list; 837 struct list_head list;
792 enum request_type type;
793 838
794 /* For REQ_MOVE_TASK */
795 task_t *task; 839 task_t *task;
796 int dest_cpu; 840 int dest_cpu;
797 841
798 /* For REQ_SET_DOMAIN */
799 struct sched_domain *sd;
800
801 struct completion done; 842 struct completion done;
802} migration_req_t; 843} migration_req_t;
803 844
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
819 } 860 }
820 861
821 init_completion(&req->done); 862 init_completion(&req->done);
822 req->type = REQ_MOVE_TASK;
823 req->task = p; 863 req->task = p;
824 req->dest_cpu = dest_cpu; 864 req->dest_cpu = dest_cpu;
825 list_add(&req->list, &rq->migration_queue); 865 list_add(&req->list, &rq->migration_queue);
@@ -886,26 +926,154 @@ void kick_process(task_t *p)
886 * We want to under-estimate the load of migration sources, to 926 * We want to under-estimate the load of migration sources, to
887 * balance conservatively. 927 * balance conservatively.
888 */ 928 */
889static inline unsigned long source_load(int cpu) 929static inline unsigned long source_load(int cpu, int type)
890{ 930{
891 runqueue_t *rq = cpu_rq(cpu); 931 runqueue_t *rq = cpu_rq(cpu);
892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 932 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
933 if (type == 0)
934 return load_now;
893 935
894 return min(rq->cpu_load, load_now); 936 return min(rq->cpu_load[type-1], load_now);
895} 937}
896 938
897/* 939/*
898 * Return a high guess at the load of a migration-target cpu 940 * Return a high guess at the load of a migration-target cpu
899 */ 941 */
900static inline unsigned long target_load(int cpu) 942static inline unsigned long target_load(int cpu, int type)
901{ 943{
902 runqueue_t *rq = cpu_rq(cpu); 944 runqueue_t *rq = cpu_rq(cpu);
903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 945 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
946 if (type == 0)
947 return load_now;
904 948
905 return max(rq->cpu_load, load_now); 949 return max(rq->cpu_load[type-1], load_now);
906} 950}
907 951
908#endif 952/*
953 * find_idlest_group finds and returns the least busy CPU group within the
954 * domain.
955 */
956static struct sched_group *
957find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
958{
959 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
960 unsigned long min_load = ULONG_MAX, this_load = 0;
961 int load_idx = sd->forkexec_idx;
962 int imbalance = 100 + (sd->imbalance_pct-100)/2;
963
964 do {
965 unsigned long load, avg_load;
966 int local_group;
967 int i;
968
969 local_group = cpu_isset(this_cpu, group->cpumask);
970 /* XXX: put a cpus allowed check */
971
972 /* Tally up the load of all CPUs in the group */
973 avg_load = 0;
974
975 for_each_cpu_mask(i, group->cpumask) {
976 /* Bias balancing toward cpus of our domain */
977 if (local_group)
978 load = source_load(i, load_idx);
979 else
980 load = target_load(i, load_idx);
981
982 avg_load += load;
983 }
984
985 /* Adjust by relative CPU power of the group */
986 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
987
988 if (local_group) {
989 this_load = avg_load;
990 this = group;
991 } else if (avg_load < min_load) {
992 min_load = avg_load;
993 idlest = group;
994 }
995 group = group->next;
996 } while (group != sd->groups);
997
998 if (!idlest || 100*this_load < imbalance*min_load)
999 return NULL;
1000 return idlest;
1001}
1002
1003/*
1004 * find_idlest_queue - find the idlest runqueue among the cpus in group.
1005 */
1006static int find_idlest_cpu(struct sched_group *group, int this_cpu)
1007{
1008 unsigned long load, min_load = ULONG_MAX;
1009 int idlest = -1;
1010 int i;
1011
1012 for_each_cpu_mask(i, group->cpumask) {
1013 load = source_load(i, 0);
1014
1015 if (load < min_load || (load == min_load && i == this_cpu)) {
1016 min_load = load;
1017 idlest = i;
1018 }
1019 }
1020
1021 return idlest;
1022}
1023
1024/*
1025 * sched_balance_self: balance the current task (running on cpu) in domains
1026 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1027 * SD_BALANCE_EXEC.
1028 *
1029 * Balance, ie. select the least loaded group.
1030 *
1031 * Returns the target CPU number, or the same CPU if no balancing is needed.
1032 *
1033 * preempt must be disabled.
1034 */
1035static int sched_balance_self(int cpu, int flag)
1036{
1037 struct task_struct *t = current;
1038 struct sched_domain *tmp, *sd = NULL;
1039
1040 for_each_domain(cpu, tmp)
1041 if (tmp->flags & flag)
1042 sd = tmp;
1043
1044 while (sd) {
1045 cpumask_t span;
1046 struct sched_group *group;
1047 int new_cpu;
1048 int weight;
1049
1050 span = sd->span;
1051 group = find_idlest_group(sd, t, cpu);
1052 if (!group)
1053 goto nextlevel;
1054
1055 new_cpu = find_idlest_cpu(group, cpu);
1056 if (new_cpu == -1 || new_cpu == cpu)
1057 goto nextlevel;
1058
1059 /* Now try balancing at a lower domain level */
1060 cpu = new_cpu;
1061nextlevel:
1062 sd = NULL;
1063 weight = cpus_weight(span);
1064 for_each_domain(cpu, tmp) {
1065 if (weight <= cpus_weight(tmp->span))
1066 break;
1067 if (tmp->flags & flag)
1068 sd = tmp;
1069 }
1070 /* while loop will break here if sd == NULL */
1071 }
1072
1073 return cpu;
1074}
1075
1076#endif /* CONFIG_SMP */
909 1077
910/* 1078/*
911 * wake_idle() will wake a task on an idle cpu if task->cpu is 1079 * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)
927 1095
928 for_each_domain(cpu, sd) { 1096 for_each_domain(cpu, sd) {
929 if (sd->flags & SD_WAKE_IDLE) { 1097 if (sd->flags & SD_WAKE_IDLE) {
930 cpus_and(tmp, sd->span, cpu_online_map); 1098 cpus_and(tmp, sd->span, p->cpus_allowed);
931 cpus_and(tmp, tmp, p->cpus_allowed);
932 for_each_cpu_mask(i, tmp) { 1099 for_each_cpu_mask(i, tmp) {
933 if (idle_cpu(i)) 1100 if (idle_cpu(i))
934 return i; 1101 return i;
935 } 1102 }
936 } 1103 }
937 else break; 1104 else
1105 break;
938 } 1106 }
939 return cpu; 1107 return cpu;
940} 1108}
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
967 runqueue_t *rq; 1135 runqueue_t *rq;
968#ifdef CONFIG_SMP 1136#ifdef CONFIG_SMP
969 unsigned long load, this_load; 1137 unsigned long load, this_load;
970 struct sched_domain *sd; 1138 struct sched_domain *sd, *this_sd = NULL;
971 int new_cpu; 1139 int new_cpu;
972#endif 1140#endif
973 1141
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
986 if (unlikely(task_running(rq, p))) 1154 if (unlikely(task_running(rq, p)))
987 goto out_activate; 1155 goto out_activate;
988 1156
989#ifdef CONFIG_SCHEDSTATS 1157 new_cpu = cpu;
1158
990 schedstat_inc(rq, ttwu_cnt); 1159 schedstat_inc(rq, ttwu_cnt);
991 if (cpu == this_cpu) { 1160 if (cpu == this_cpu) {
992 schedstat_inc(rq, ttwu_local); 1161 schedstat_inc(rq, ttwu_local);
993 } else { 1162 goto out_set_cpu;
994 for_each_domain(this_cpu, sd) { 1163 }
995 if (cpu_isset(cpu, sd->span)) { 1164
996 schedstat_inc(sd, ttwu_wake_remote); 1165 for_each_domain(this_cpu, sd) {
997 break; 1166 if (cpu_isset(cpu, sd->span)) {
998 } 1167 schedstat_inc(sd, ttwu_wake_remote);
1168 this_sd = sd;
1169 break;
999 } 1170 }
1000 } 1171 }
1001#endif
1002 1172
1003 new_cpu = cpu; 1173 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu; 1174 goto out_set_cpu;
1006 1175
1007 load = source_load(cpu);
1008 this_load = target_load(this_cpu);
1009
1010 /* 1176 /*
1011 * If sync wakeup then subtract the (maximum possible) effect of 1177 * Check for affine wakeup and passive balancing possibilities.
1012 * the currently running task from the load of the current CPU:
1013 */ 1178 */
1014 if (sync) 1179 if (this_sd) {
1015 this_load -= SCHED_LOAD_SCALE; 1180 int idx = this_sd->wake_idx;
1181 unsigned int imbalance;
1016 1182
1017 /* Don't pull the task off an idle CPU to a busy one */ 1183 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1019 goto out_set_cpu;
1020 1184
1021 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1185 load = source_load(cpu, idx);
1186 this_load = target_load(this_cpu, idx);
1022 1187
1023 /* 1188 new_cpu = this_cpu; /* Wake to this CPU if we can */
1024 * Scan domains for affine wakeup and passive balancing
1025 * possibilities.
1026 */
1027 for_each_domain(this_cpu, sd) {
1028 unsigned int imbalance;
1029 /*
1030 * Start passive balancing when half the imbalance_pct
1031 * limit is reached.
1032 */
1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1034 1189
1035 if ((sd->flags & SD_WAKE_AFFINE) && 1190 if (this_sd->flags & SD_WAKE_AFFINE) {
1036 !task_hot(p, rq->timestamp_last_tick, sd)) { 1191 unsigned long tl = this_load;
1037 /* 1192 /*
1038 * This domain has SD_WAKE_AFFINE and p is cache cold 1193 * If sync wakeup then subtract the (maximum possible)
1039 * in this domain. 1194 * effect of the currently running task from the load
1195 * of the current CPU:
1040 */ 1196 */
1041 if (cpu_isset(cpu, sd->span)) { 1197 if (sync)
1042 schedstat_inc(sd, ttwu_move_affine); 1198 tl -= SCHED_LOAD_SCALE;
1199
1200 if ((tl <= load &&
1201 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
1202 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
1203 /*
1204 * This domain has SD_WAKE_AFFINE and
1205 * p is cache cold in this domain, and
1206 * there is no bad imbalance.
1207 */
1208 schedstat_inc(this_sd, ttwu_move_affine);
1043 goto out_set_cpu; 1209 goto out_set_cpu;
1044 } 1210 }
1045 } else if ((sd->flags & SD_WAKE_BALANCE) && 1211 }
1046 imbalance*this_load <= 100*load) { 1212
1047 /* 1213 /*
1048 * This domain has SD_WAKE_BALANCE and there is 1214 * Start passive balancing when half the imbalance_pct
1049 * an imbalance. 1215 * limit is reached.
1050 */ 1216 */
1051 if (cpu_isset(cpu, sd->span)) { 1217 if (this_sd->flags & SD_WAKE_BALANCE) {
1052 schedstat_inc(sd, ttwu_move_balance); 1218 if (imbalance*this_load <= 100*load) {
1219 schedstat_inc(this_sd, ttwu_move_balance);
1053 goto out_set_cpu; 1220 goto out_set_cpu;
1054 } 1221 }
1055 } 1222 }
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
1120 return try_to_wake_up(p, state, 0); 1287 return try_to_wake_up(p, state, 0);
1121} 1288}
1122 1289
1123#ifdef CONFIG_SMP
1124static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1125 struct sched_domain *sd);
1126#endif
1127
1128/* 1290/*
1129 * Perform scheduler related setup for a newly forked process p. 1291 * Perform scheduler related setup for a newly forked process p.
1130 * p is forked by current. 1292 * p is forked by current.
1131 */ 1293 */
1132void fastcall sched_fork(task_t *p) 1294void fastcall sched_fork(task_t *p, int clone_flags)
1133{ 1295{
1296 int cpu = get_cpu();
1297
1298#ifdef CONFIG_SMP
1299 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1300#endif
1301 set_task_cpu(p, cpu);
1302
1134 /* 1303 /*
1135 * We mark the process as running here, but have not actually 1304 * We mark the process as running here, but have not actually
1136 * inserted it onto the runqueue yet. This guarantees that 1305 * inserted it onto the runqueue yet. This guarantees that
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
1140 p->state = TASK_RUNNING; 1309 p->state = TASK_RUNNING;
1141 INIT_LIST_HEAD(&p->run_list); 1310 INIT_LIST_HEAD(&p->run_list);
1142 p->array = NULL; 1311 p->array = NULL;
1143 spin_lock_init(&p->switch_lock);
1144#ifdef CONFIG_SCHEDSTATS 1312#ifdef CONFIG_SCHEDSTATS
1145 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1313 memset(&p->sched_info, 0, sizeof(p->sched_info));
1146#endif 1314#endif
1315#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1316 p->oncpu = 0;
1317#endif
1147#ifdef CONFIG_PREEMPT 1318#ifdef CONFIG_PREEMPT
1148 /* 1319 /* Want to start with kernel preemption disabled. */
1149 * During context-switch we hold precisely one spinlock, which
1150 * schedule_tail drops. (in the common case it's this_rq()->lock,
1151 * but it also can be p->switch_lock.) So we compensate with a count
1152 * of 1. Also, we want to start with kernel preemption disabled.
1153 */
1154 p->thread_info->preempt_count = 1; 1320 p->thread_info->preempt_count = 1;
1155#endif 1321#endif
1156 /* 1322 /*
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
1174 * runqueue lock is not a problem. 1340 * runqueue lock is not a problem.
1175 */ 1341 */
1176 current->time_slice = 1; 1342 current->time_slice = 1;
1177 preempt_disable();
1178 scheduler_tick(); 1343 scheduler_tick();
1179 local_irq_enable(); 1344 }
1180 preempt_enable(); 1345 local_irq_enable();
1181 } else 1346 put_cpu();
1182 local_irq_enable();
1183} 1347}
1184 1348
1185/* 1349/*
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1196 runqueue_t *rq, *this_rq; 1360 runqueue_t *rq, *this_rq;
1197 1361
1198 rq = task_rq_lock(p, &flags); 1362 rq = task_rq_lock(p, &flags);
1199 cpu = task_cpu(p);
1200 this_cpu = smp_processor_id();
1201
1202 BUG_ON(p->state != TASK_RUNNING); 1363 BUG_ON(p->state != TASK_RUNNING);
1364 this_cpu = smp_processor_id();
1365 cpu = task_cpu(p);
1203 1366
1204 /* 1367 /*
1205 * We decrease the sleep average of forking parents 1368 * We decrease the sleep average of forking parents
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)
1296} 1459}
1297 1460
1298/** 1461/**
1462 * prepare_task_switch - prepare to switch tasks
1463 * @rq: the runqueue preparing to switch
1464 * @next: the task we are going to switch to.
1465 *
1466 * This is called with the rq lock held and interrupts off. It must
1467 * be paired with a subsequent finish_task_switch after the context
1468 * switch.
1469 *
1470 * prepare_task_switch sets up locking and calls architecture specific
1471 * hooks.
1472 */
1473static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1474{
1475 prepare_lock_switch(rq, next);
1476 prepare_arch_switch(next);
1477}
1478
1479/**
1299 * finish_task_switch - clean up after a task-switch 1480 * finish_task_switch - clean up after a task-switch
1300 * @prev: the thread we just switched away from. 1481 * @prev: the thread we just switched away from.
1301 * 1482 *
1302 * We enter this with the runqueue still locked, and finish_arch_switch() 1483 * finish_task_switch must be called after the context switch, paired
1303 * will unlock it along with doing any other architecture-specific cleanup 1484 * with a prepare_task_switch call before the context switch.
1304 * actions. 1485 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1486 * and do any other architecture-specific cleanup actions.
1305 * 1487 *
1306 * Note that we may have delayed dropping an mm in context_switch(). If 1488 * Note that we may have delayed dropping an mm in context_switch(). If
1307 * so, we finish that here outside of the runqueue lock. (Doing it 1489 * so, we finish that here outside of the runqueue lock. (Doing it
1308 * with the lock held can cause deadlocks; see schedule() for 1490 * with the lock held can cause deadlocks; see schedule() for
1309 * details.) 1491 * details.)
1310 */ 1492 */
1311static inline void finish_task_switch(task_t *prev) 1493static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1312 __releases(rq->lock) 1494 __releases(rq->lock)
1313{ 1495{
1314 runqueue_t *rq = this_rq();
1315 struct mm_struct *mm = rq->prev_mm; 1496 struct mm_struct *mm = rq->prev_mm;
1316 unsigned long prev_task_flags; 1497 unsigned long prev_task_flags;
1317 1498
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
1329 * Manfred Spraul <manfred@colorfullife.com> 1510 * Manfred Spraul <manfred@colorfullife.com>
1330 */ 1511 */
1331 prev_task_flags = prev->flags; 1512 prev_task_flags = prev->flags;
1332 finish_arch_switch(rq, prev); 1513 finish_arch_switch(prev);
1514 finish_lock_switch(rq, prev);
1333 if (mm) 1515 if (mm)
1334 mmdrop(mm); 1516 mmdrop(mm);
1335 if (unlikely(prev_task_flags & PF_DEAD)) 1517 if (unlikely(prev_task_flags & PF_DEAD))
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
1343asmlinkage void schedule_tail(task_t *prev) 1525asmlinkage void schedule_tail(task_t *prev)
1344 __releases(rq->lock) 1526 __releases(rq->lock)
1345{ 1527{
1346 finish_task_switch(prev); 1528 runqueue_t *rq = this_rq();
1347 1529 finish_task_switch(rq, prev);
1530#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1531 /* In this case, finish_task_switch does not reenable preemption */
1532 preempt_enable();
1533#endif
1348 if (current->set_child_tid) 1534 if (current->set_child_tid)
1349 put_user(current->pid, current->set_child_tid); 1535 put_user(current->pid, current->set_child_tid);
1350} 1536}
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1494} 1680}
1495 1681
1496/* 1682/*
1497 * find_idlest_cpu - find the least busy runqueue.
1498 */
1499static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1500 struct sched_domain *sd)
1501{
1502 unsigned long load, min_load, this_load;
1503 int i, min_cpu;
1504 cpumask_t mask;
1505
1506 min_cpu = UINT_MAX;
1507 min_load = ULONG_MAX;
1508
1509 cpus_and(mask, sd->span, p->cpus_allowed);
1510
1511 for_each_cpu_mask(i, mask) {
1512 load = target_load(i);
1513
1514 if (load < min_load) {
1515 min_cpu = i;
1516 min_load = load;
1517
1518 /* break out early on an idle CPU: */
1519 if (!min_load)
1520 break;
1521 }
1522 }
1523
1524 /* add +1 to account for the new task */
1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1526
1527 /*
1528 * Would with the addition of the new task to the
1529 * current CPU there be an imbalance between this
1530 * CPU and the idlest CPU?
1531 *
1532 * Use half of the balancing threshold - new-context is
1533 * a good opportunity to balance.
1534 */
1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1536 return min_cpu;
1537
1538 return this_cpu;
1539}
1540
1541/*
1542 * If dest_cpu is allowed for this process, migrate the task to it. 1683 * If dest_cpu is allowed for this process, migrate the task to it.
1543 * This is accomplished by forcing the cpu_allowed mask to only 1684 * This is accomplished by forcing the cpu_allowed mask to only
1544 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1685 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1571,37 +1712,16 @@ out:
1571} 1712}
1572 1713
1573/* 1714/*
1574 * sched_exec(): find the highest-level, exec-balance-capable 1715 * sched_exec - execve() is a valuable balancing opportunity, because at
1575 * domain and try to migrate the task to the least loaded CPU. 1716 * this point the task has the smallest effective memory and cache footprint.
1576 *
1577 * execve() is a valuable balancing opportunity, because at this point
1578 * the task has the smallest effective memory and cache footprint.
1579 */ 1717 */
1580void sched_exec(void) 1718void sched_exec(void)
1581{ 1719{
1582 struct sched_domain *tmp, *sd = NULL;
1583 int new_cpu, this_cpu = get_cpu(); 1720 int new_cpu, this_cpu = get_cpu();
1584 1721 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1585 /* Prefer the current CPU if there's only this task running */
1586 if (this_rq()->nr_running <= 1)
1587 goto out;
1588
1589 for_each_domain(this_cpu, tmp)
1590 if (tmp->flags & SD_BALANCE_EXEC)
1591 sd = tmp;
1592
1593 if (sd) {
1594 schedstat_inc(sd, sbe_attempts);
1595 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1596 if (new_cpu != this_cpu) {
1597 schedstat_inc(sd, sbe_pushed);
1598 put_cpu();
1599 sched_migrate_task(current, new_cpu);
1600 return;
1601 }
1602 }
1603out:
1604 put_cpu(); 1722 put_cpu();
1723 if (new_cpu != this_cpu)
1724 sched_migrate_task(current, new_cpu);
1605} 1725}
1606 1726
1607/* 1727/*
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1632 */ 1752 */
1633static inline 1753static inline
1634int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1754int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1635 struct sched_domain *sd, enum idle_type idle) 1755 struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1636{ 1756{
1637 /* 1757 /*
1638 * We do not migrate tasks that are: 1758 * We do not migrate tasks that are:
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1760 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1641 * 3) are cache-hot on their current CPU. 1761 * 3) are cache-hot on their current CPU.
1642 */ 1762 */
1643 if (task_running(rq, p))
1644 return 0;
1645 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1763 if (!cpu_isset(this_cpu, p->cpus_allowed))
1646 return 0; 1764 return 0;
1765 *all_pinned = 0;
1766
1767 if (task_running(rq, p))
1768 return 0;
1647 1769
1648 /* 1770 /*
1649 * Aggressive migration if: 1771 * Aggressive migration if:
1650 * 1) the [whole] cpu is idle, or 1772 * 1) task is cache cold, or
1651 * 2) too many balance attempts have failed. 1773 * 2) too many balance attempts have failed.
1652 */ 1774 */
1653 1775
1654 if (cpu_and_siblings_are_idle(this_cpu) || \ 1776 if (sd->nr_balance_failed > sd->cache_nice_tries)
1655 sd->nr_balance_failed > sd->cache_nice_tries)
1656 return 1; 1777 return 1;
1657 1778
1658 if (task_hot(p, rq->timestamp_last_tick, sd)) 1779 if (task_hot(p, rq->timestamp_last_tick, sd))
1659 return 0; 1780 return 0;
1660 return 1; 1781 return 1;
1661} 1782}
1662 1783
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1669 */ 1790 */
1670static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1791static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1671 unsigned long max_nr_move, struct sched_domain *sd, 1792 unsigned long max_nr_move, struct sched_domain *sd,
1672 enum idle_type idle) 1793 enum idle_type idle, int *all_pinned)
1673{ 1794{
1674 prio_array_t *array, *dst_array; 1795 prio_array_t *array, *dst_array;
1675 struct list_head *head, *curr; 1796 struct list_head *head, *curr;
1676 int idx, pulled = 0; 1797 int idx, pulled = 0, pinned = 0;
1677 task_t *tmp; 1798 task_t *tmp;
1678 1799
1679 if (max_nr_move <= 0 || busiest->nr_running <= 1) 1800 if (max_nr_move == 0)
1680 goto out; 1801 goto out;
1681 1802
1803 pinned = 1;
1804
1682 /* 1805 /*
1683 * We first consider expired tasks. Those will likely not be 1806 * We first consider expired tasks. Those will likely not be
1684 * executed in the near future, and they are most likely to 1807 * executed in the near future, and they are most likely to
@@ -1717,7 +1840,7 @@ skip_queue:
1717 1840
1718 curr = curr->prev; 1841 curr = curr->prev;
1719 1842
1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { 1843 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1721 if (curr != head) 1844 if (curr != head)
1722 goto skip_queue; 1845 goto skip_queue;
1723 idx++; 1846 idx++;
@@ -1746,6 +1869,9 @@ out:
1746 * inside pull_task(). 1869 * inside pull_task().
1747 */ 1870 */
1748 schedstat_add(sd, lb_gained[idle], pulled); 1871 schedstat_add(sd, lb_gained[idle], pulled);
1872
1873 if (all_pinned)
1874 *all_pinned = pinned;
1749 return pulled; 1875 return pulled;
1750} 1876}
1751 1877
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1760{ 1886{
1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1887 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1888 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1889 int load_idx;
1763 1890
1764 max_load = this_load = total_load = total_pwr = 0; 1891 max_load = this_load = total_load = total_pwr = 0;
1892 if (idle == NOT_IDLE)
1893 load_idx = sd->busy_idx;
1894 else if (idle == NEWLY_IDLE)
1895 load_idx = sd->newidle_idx;
1896 else
1897 load_idx = sd->idle_idx;
1765 1898
1766 do { 1899 do {
1767 unsigned long load; 1900 unsigned long load;
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1776 for_each_cpu_mask(i, group->cpumask) { 1909 for_each_cpu_mask(i, group->cpumask) {
1777 /* Bias balancing toward cpus of our domain */ 1910 /* Bias balancing toward cpus of our domain */
1778 if (local_group) 1911 if (local_group)
1779 load = target_load(i); 1912 load = target_load(i, load_idx);
1780 else 1913 else
1781 load = source_load(i); 1914 load = source_load(i, load_idx);
1782 1915
1783 avg_load += load; 1916 avg_load += load;
1784 } 1917 }
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1792 if (local_group) { 1925 if (local_group) {
1793 this_load = avg_load; 1926 this_load = avg_load;
1794 this = group; 1927 this = group;
1795 goto nextgroup;
1796 } else if (avg_load > max_load) { 1928 } else if (avg_load > max_load) {
1797 max_load = avg_load; 1929 max_load = avg_load;
1798 busiest = group; 1930 busiest = group;
1799 } 1931 }
1800nextgroup:
1801 group = group->next; 1932 group = group->next;
1802 } while (group != sd->groups); 1933 } while (group != sd->groups);
1803 1934
@@ -1870,15 +2001,9 @@ nextgroup:
1870 2001
1871 /* Get rid of the scaling factor, rounding down as we divide */ 2002 /* Get rid of the scaling factor, rounding down as we divide */
1872 *imbalance = *imbalance / SCHED_LOAD_SCALE; 2003 *imbalance = *imbalance / SCHED_LOAD_SCALE;
1873
1874 return busiest; 2004 return busiest;
1875 2005
1876out_balanced: 2006out_balanced:
1877 if (busiest && (idle == NEWLY_IDLE ||
1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1879 *imbalance = 1;
1880 return busiest;
1881 }
1882 2007
1883 *imbalance = 0; 2008 *imbalance = 0;
1884 return NULL; 2009 return NULL;
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
1894 int i; 2019 int i;
1895 2020
1896 for_each_cpu_mask(i, group->cpumask) { 2021 for_each_cpu_mask(i, group->cpumask) {
1897 load = source_load(i); 2022 load = source_load(i, 0);
1898 2023
1899 if (load > max_load) { 2024 if (load > max_load) {
1900 max_load = load; 2025 max_load = load;
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
1906} 2031}
1907 2032
1908/* 2033/*
2034 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2035 * so long as it is large enough.
2036 */
2037#define MAX_PINNED_INTERVAL 512
2038
2039/*
1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2040 * Check this_cpu to ensure it is balanced within domain. Attempt to move
1910 * tasks if there is an imbalance. 2041 * tasks if there is an imbalance.
1911 * 2042 *
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1917 struct sched_group *group; 2048 struct sched_group *group;
1918 runqueue_t *busiest; 2049 runqueue_t *busiest;
1919 unsigned long imbalance; 2050 unsigned long imbalance;
1920 int nr_moved; 2051 int nr_moved, all_pinned = 0;
2052 int active_balance = 0;
1921 2053
1922 spin_lock(&this_rq->lock); 2054 spin_lock(&this_rq->lock);
1923 schedstat_inc(sd, lb_cnt[idle]); 2055 schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1934 goto out_balanced; 2066 goto out_balanced;
1935 } 2067 }
1936 2068
1937 /* 2069 BUG_ON(busiest == this_rq);
1938 * This should be "impossible", but since load
1939 * balancing is inherently racy and statistical,
1940 * it could happen in theory.
1941 */
1942 if (unlikely(busiest == this_rq)) {
1943 WARN_ON(1);
1944 goto out_balanced;
1945 }
1946 2070
1947 schedstat_add(sd, lb_imbalance[idle], imbalance); 2071 schedstat_add(sd, lb_imbalance[idle], imbalance);
1948 2072
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1956 */ 2080 */
1957 double_lock_balance(this_rq, busiest); 2081 double_lock_balance(this_rq, busiest);
1958 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2082 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1959 imbalance, sd, idle); 2083 imbalance, sd, idle,
2084 &all_pinned);
1960 spin_unlock(&busiest->lock); 2085 spin_unlock(&busiest->lock);
2086
2087 /* All tasks on this runqueue were pinned by CPU affinity */
2088 if (unlikely(all_pinned))
2089 goto out_balanced;
1961 } 2090 }
2091
1962 spin_unlock(&this_rq->lock); 2092 spin_unlock(&this_rq->lock);
1963 2093
1964 if (!nr_moved) { 2094 if (!nr_moved) {
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1966 sd->nr_balance_failed++; 2096 sd->nr_balance_failed++;
1967 2097
1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2098 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1969 int wake = 0;
1970 2099
1971 spin_lock(&busiest->lock); 2100 spin_lock(&busiest->lock);
1972 if (!busiest->active_balance) { 2101 if (!busiest->active_balance) {
1973 busiest->active_balance = 1; 2102 busiest->active_balance = 1;
1974 busiest->push_cpu = this_cpu; 2103 busiest->push_cpu = this_cpu;
1975 wake = 1; 2104 active_balance = 1;
1976 } 2105 }
1977 spin_unlock(&busiest->lock); 2106 spin_unlock(&busiest->lock);
1978 if (wake) 2107 if (active_balance)
1979 wake_up_process(busiest->migration_thread); 2108 wake_up_process(busiest->migration_thread);
1980 2109
1981 /* 2110 /*
1982 * We've kicked active balancing, reset the failure 2111 * We've kicked active balancing, reset the failure
1983 * counter. 2112 * counter.
1984 */ 2113 */
1985 sd->nr_balance_failed = sd->cache_nice_tries; 2114 sd->nr_balance_failed = sd->cache_nice_tries+1;
1986 } 2115 }
1987 2116 } else
1988 /*
1989 * We were unbalanced, but unsuccessful in move_tasks(),
1990 * so bump the balance_interval to lessen the lock contention.
1991 */
1992 if (sd->balance_interval < sd->max_interval)
1993 sd->balance_interval++;
1994 } else {
1995 sd->nr_balance_failed = 0; 2117 sd->nr_balance_failed = 0;
1996 2118
2119 if (likely(!active_balance)) {
1997 /* We were unbalanced, so reset the balancing interval */ 2120 /* We were unbalanced, so reset the balancing interval */
1998 sd->balance_interval = sd->min_interval; 2121 sd->balance_interval = sd->min_interval;
2122 } else {
2123 /*
2124 * If we've begun active balancing, start to back off. This
2125 * case may not be covered by the all_pinned logic if there
2126 * is only 1 task on the busy runqueue (because we don't call
2127 * move_tasks).
2128 */
2129 if (sd->balance_interval < sd->max_interval)
2130 sd->balance_interval *= 2;
1999 } 2131 }
2000 2132
2001 return nr_moved; 2133 return nr_moved;
@@ -2005,8 +2137,10 @@ out_balanced:
2005 2137
2006 schedstat_inc(sd, lb_balanced[idle]); 2138 schedstat_inc(sd, lb_balanced[idle]);
2007 2139
2140 sd->nr_balance_failed = 0;
2008 /* tune up the balancing interval */ 2141 /* tune up the balancing interval */
2009 if (sd->balance_interval < sd->max_interval) 2142 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2143 (sd->balance_interval < sd->max_interval))
2010 sd->balance_interval *= 2; 2144 sd->balance_interval *= 2;
2011 2145
2012 return 0; 2146 return 0;
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2164 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2165 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2032 if (!group) { 2166 if (!group) {
2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2167 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2035 goto out; 2168 goto out_balanced;
2036 } 2169 }
2037 2170
2038 busiest = find_busiest_queue(group); 2171 busiest = find_busiest_queue(group);
2039 if (!busiest || busiest == this_rq) { 2172 if (!busiest) {
2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2173 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2042 goto out; 2174 goto out_balanced;
2043 } 2175 }
2044 2176
2177 BUG_ON(busiest == this_rq);
2178
2045 /* Attempt to move tasks */ 2179 /* Attempt to move tasks */
2046 double_lock_balance(this_rq, busiest); 2180 double_lock_balance(this_rq, busiest);
2047 2181
2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2182 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2049 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2183 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2050 imbalance, sd, NEWLY_IDLE); 2184 imbalance, sd, NEWLY_IDLE, NULL);
2051 if (!nr_moved) 2185 if (!nr_moved)
2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2186 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2187 else
2188 sd->nr_balance_failed = 0;
2053 2189
2054 spin_unlock(&busiest->lock); 2190 spin_unlock(&busiest->lock);
2055
2056out:
2057 return nr_moved; 2191 return nr_moved;
2192
2193out_balanced:
2194 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2195 sd->nr_balance_failed = 0;
2196 return 0;
2058} 2197}
2059 2198
2060/* 2199/*
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2086static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2225static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2087{ 2226{
2088 struct sched_domain *sd; 2227 struct sched_domain *sd;
2089 struct sched_group *cpu_group;
2090 runqueue_t *target_rq; 2228 runqueue_t *target_rq;
2091 cpumask_t visited_cpus; 2229 int target_cpu = busiest_rq->push_cpu;
2092 int cpu; 2230
2231 if (busiest_rq->nr_running <= 1)
2232 /* no task to move */
2233 return;
2234
2235 target_rq = cpu_rq(target_cpu);
2093 2236
2094 /* 2237 /*
2095 * Search for suitable CPUs to push tasks to in successively higher 2238 * This condition is "impossible", if it occurs
2096 * domains with SD_LOAD_BALANCE set. 2239 * we need to fix it. Originally reported by
2240 * Bjorn Helgaas on a 128-cpu setup.
2097 */ 2241 */
2098 visited_cpus = CPU_MASK_NONE; 2242 BUG_ON(busiest_rq == target_rq);
2099 for_each_domain(busiest_cpu, sd) {
2100 if (!(sd->flags & SD_LOAD_BALANCE))
2101 /* no more domains to search */
2102 break;
2103 2243
2104 schedstat_inc(sd, alb_cnt); 2244 /* move a task from busiest_rq to target_rq */
2245 double_lock_balance(busiest_rq, target_rq);
2105 2246
2106 cpu_group = sd->groups; 2247 /* Search for an sd spanning us and the target CPU. */
2107 do { 2248 for_each_domain(target_cpu, sd)
2108 for_each_cpu_mask(cpu, cpu_group->cpumask) { 2249 if ((sd->flags & SD_LOAD_BALANCE) &&
2109 if (busiest_rq->nr_running <= 1) 2250 cpu_isset(busiest_cpu, sd->span))
2110 /* no more tasks left to move */ 2251 break;
2111 return; 2252
2112 if (cpu_isset(cpu, visited_cpus)) 2253 if (unlikely(sd == NULL))
2113 continue; 2254 goto out;
2114 cpu_set(cpu, visited_cpus); 2255
2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) 2256 schedstat_inc(sd, alb_cnt);
2116 continue; 2257
2117 2258 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
2118 target_rq = cpu_rq(cpu); 2259 schedstat_inc(sd, alb_pushed);
2119 /* 2260 else
2120 * This condition is "impossible", if it occurs 2261 schedstat_inc(sd, alb_failed);
2121 * we need to fix it. Originally reported by 2262out:
2122 * Bjorn Helgaas on a 128-cpu setup. 2263 spin_unlock(&target_rq->lock);
2123 */
2124 BUG_ON(busiest_rq == target_rq);
2125
2126 /* move a task from busiest_rq to target_rq */
2127 double_lock_balance(busiest_rq, target_rq);
2128 if (move_tasks(target_rq, cpu, busiest_rq,
2129 1, sd, SCHED_IDLE)) {
2130 schedstat_inc(sd, alb_pushed);
2131 } else {
2132 schedstat_inc(sd, alb_failed);
2133 }
2134 spin_unlock(&target_rq->lock);
2135 }
2136 cpu_group = cpu_group->next;
2137 } while (cpu_group != sd->groups);
2138 }
2139} 2264}
2140 2265
2141/* 2266/*
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2156 unsigned long old_load, this_load; 2281 unsigned long old_load, this_load;
2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu); 2282 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2158 struct sched_domain *sd; 2283 struct sched_domain *sd;
2284 int i;
2159 2285
2160 /* Update our load */
2161 old_load = this_rq->cpu_load;
2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2286 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2163 /* 2287 /* Update our load */
2164 * Round up the averaging division if load is increasing. This 2288 for (i = 0; i < 3; i++) {
2165 * prevents us from getting stuck on 9 if the load is 10, for 2289 unsigned long new_load = this_load;
2166 * example. 2290 int scale = 1 << i;
2167 */ 2291 old_load = this_rq->cpu_load[i];
2168 if (this_load > old_load) 2292 /*
2169 old_load++; 2293 * Round up the averaging division if load is increasing. This
2170 this_rq->cpu_load = (old_load + this_load) / 2; 2294 * prevents us from getting stuck on 9 if the load is 10, for
2295 * example.
2296 */
2297 if (new_load > old_load)
2298 new_load += scale-1;
2299 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2300 }
2171 2301
2172 for_each_domain(this_cpu, sd) { 2302 for_each_domain(this_cpu, sd) {
2173 unsigned long interval; 2303 unsigned long interval;
@@ -2447,11 +2577,15 @@ out:
2447#ifdef CONFIG_SCHED_SMT 2577#ifdef CONFIG_SCHED_SMT
2448static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2578static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2449{ 2579{
2450 struct sched_domain *sd = this_rq->sd; 2580 struct sched_domain *tmp, *sd = NULL;
2451 cpumask_t sibling_map; 2581 cpumask_t sibling_map;
2452 int i; 2582 int i;
2453 2583
2454 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2584 for_each_domain(this_cpu, tmp)
2585 if (tmp->flags & SD_SHARE_CPUPOWER)
2586 sd = tmp;
2587
2588 if (!sd)
2455 return; 2589 return;
2456 2590
2457 /* 2591 /*
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2492 2626
2493static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2627static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2494{ 2628{
2495 struct sched_domain *sd = this_rq->sd; 2629 struct sched_domain *tmp, *sd = NULL;
2496 cpumask_t sibling_map; 2630 cpumask_t sibling_map;
2497 prio_array_t *array; 2631 prio_array_t *array;
2498 int ret = 0, i; 2632 int ret = 0, i;
2499 task_t *p; 2633 task_t *p;
2500 2634
2501 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2635 for_each_domain(this_cpu, tmp)
2636 if (tmp->flags & SD_SHARE_CPUPOWER)
2637 sd = tmp;
2638
2639 if (!sd)
2502 return 0; 2640 return 0;
2503 2641
2504 /* 2642 /*
@@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val)
2576 /* 2714 /*
2577 * Underflow? 2715 * Underflow?
2578 */ 2716 */
2579 BUG_ON(((int)preempt_count() < 0)); 2717 BUG_ON((preempt_count() < 0));
2580 preempt_count() += val; 2718 preempt_count() += val;
2581 /* 2719 /*
2582 * Spinlock count overflowing soon? 2720 * Spinlock count overflowing soon?
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)
2613 struct list_head *queue; 2751 struct list_head *queue;
2614 unsigned long long now; 2752 unsigned long long now;
2615 unsigned long run_time; 2753 unsigned long run_time;
2616 int cpu, idx; 2754 int cpu, idx, new_prio;
2617 2755
2618 /* 2756 /*
2619 * Test if we are atomic. Since do_exit() needs to call into 2757 * Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2873,14 @@ go_idle:
2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 2873 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2736 2874
2737 array = next->array; 2875 array = next->array;
2738 dequeue_task(next, array); 2876 new_prio = recalc_task_prio(next, next->timestamp + delta);
2739 recalc_task_prio(next, next->timestamp + delta); 2877
2740 enqueue_task(next, array); 2878 if (unlikely(next->prio != new_prio)) {
2879 dequeue_task(next, array);
2880 next->prio = new_prio;
2881 enqueue_task(next, array);
2882 } else
2883 requeue_task(next, array);
2741 } 2884 }
2742 next->activated = 0; 2885 next->activated = 0;
2743switch_tasks: 2886switch_tasks:
@@ -2761,11 +2904,15 @@ switch_tasks:
2761 rq->curr = next; 2904 rq->curr = next;
2762 ++*switch_count; 2905 ++*switch_count;
2763 2906
2764 prepare_arch_switch(rq, next); 2907 prepare_task_switch(rq, next);
2765 prev = context_switch(rq, prev, next); 2908 prev = context_switch(rq, prev, next);
2766 barrier(); 2909 barrier();
2767 2910 /*
2768 finish_task_switch(prev); 2911 * this_rq must be evaluated again because prev may have moved
2912 * CPUs since it called schedule(), thus the 'rq' on its stack
2913 * frame will be invalid.
2914 */
2915 finish_task_switch(this_rq(), prev);
2769 } else 2916 } else
2770 spin_unlock_irq(&rq->lock); 2917 spin_unlock_irq(&rq->lock);
2771 2918
@@ -2869,7 +3016,7 @@ need_resched:
2869 3016
2870int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 3017int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2871{ 3018{
2872 task_t *p = curr->task; 3019 task_t *p = curr->private;
2873 return try_to_wake_up(p, mode, sync); 3020 return try_to_wake_up(p, mode, sync);
2874} 3021}
2875 3022
@@ -3301,15 +3448,7 @@ int task_nice(const task_t *p)
3301{ 3448{
3302 return TASK_NICE(p); 3449 return TASK_NICE(p);
3303} 3450}
3304
3305/*
3306 * The only users of task_nice are binfmt_elf and binfmt_elf32.
3307 * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3308 * Therefore, task_nice is needed if there is a compat_mode.
3309 */
3310#ifdef CONFIG_COMPAT
3311EXPORT_SYMBOL_GPL(task_nice); 3451EXPORT_SYMBOL_GPL(task_nice);
3312#endif
3313 3452
3314/** 3453/**
3315 * idle_cpu - is a given cpu idle currently? 3454 * idle_cpu - is a given cpu idle currently?
@@ -3384,13 +3523,24 @@ recheck:
3384 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3523 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3385 return -EINVAL; 3524 return -EINVAL;
3386 3525
3387 if ((policy == SCHED_FIFO || policy == SCHED_RR) && 3526 /*
3388 param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && 3527 * Allow unprivileged RT tasks to decrease priority:
3389 !capable(CAP_SYS_NICE)) 3528 */
3390 return -EPERM; 3529 if (!capable(CAP_SYS_NICE)) {
3391 if ((current->euid != p->euid) && (current->euid != p->uid) && 3530 /* can't change policy */
3392 !capable(CAP_SYS_NICE)) 3531 if (policy != p->policy)
3393 return -EPERM; 3532 return -EPERM;
3533 /* can't increase priority */
3534 if (policy != SCHED_NORMAL &&
3535 param->sched_priority > p->rt_priority &&
3536 param->sched_priority >
3537 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3538 return -EPERM;
3539 /* can't change other user's priorities */
3540 if ((current->euid != p->euid) &&
3541 (current->euid != p->uid))
3542 return -EPERM;
3543 }
3394 3544
3395 retval = security_task_setscheduler(p, policy, param); 3545 retval = security_task_setscheduler(p, policy, param);
3396 if (retval) 3546 if (retval)
@@ -3814,7 +3964,7 @@ EXPORT_SYMBOL(yield);
3814 */ 3964 */
3815void __sched io_schedule(void) 3965void __sched io_schedule(void)
3816{ 3966{
3817 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); 3967 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3818 3968
3819 atomic_inc(&rq->nr_iowait); 3969 atomic_inc(&rq->nr_iowait);
3820 schedule(); 3970 schedule();
@@ -3825,7 +3975,7 @@ EXPORT_SYMBOL(io_schedule);
3825 3975
3826long __sched io_schedule_timeout(long timeout) 3976long __sched io_schedule_timeout(long timeout)
3827{ 3977{
3828 struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); 3978 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
3829 long ret; 3979 long ret;
3830 3980
3831 atomic_inc(&rq->nr_iowait); 3981 atomic_inc(&rq->nr_iowait);
@@ -4016,6 +4166,14 @@ void show_state(void)
4016 read_unlock(&tasklist_lock); 4166 read_unlock(&tasklist_lock);
4017} 4167}
4018 4168
4169/**
4170 * init_idle - set up an idle thread for a given CPU
4171 * @idle: task in question
4172 * @cpu: cpu the idle task belongs to
4173 *
4174 * NOTE: this function does not set the idle thread's NEED_RESCHED
4175 * flag, to make booting more robust.
4176 */
4019void __devinit init_idle(task_t *idle, int cpu) 4177void __devinit init_idle(task_t *idle, int cpu)
4020{ 4178{
4021 runqueue_t *rq = cpu_rq(cpu); 4179 runqueue_t *rq = cpu_rq(cpu);
@@ -4030,7 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu)
4030 4188
4031 spin_lock_irqsave(&rq->lock, flags); 4189 spin_lock_irqsave(&rq->lock, flags);
4032 rq->curr = rq->idle = idle; 4190 rq->curr = rq->idle = idle;
4033 set_tsk_need_resched(idle); 4191#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4192 idle->oncpu = 1;
4193#endif
4034 spin_unlock_irqrestore(&rq->lock, flags); 4194 spin_unlock_irqrestore(&rq->lock, flags);
4035 4195
4036 /* Set the preempt count _outside_ the spinlocks! */ 4196 /* Set the preempt count _outside_ the spinlocks! */
@@ -4174,8 +4334,7 @@ static int migration_thread(void * data)
4174 struct list_head *head; 4334 struct list_head *head;
4175 migration_req_t *req; 4335 migration_req_t *req;
4176 4336
4177 if (current->flags & PF_FREEZE) 4337 try_to_freeze();
4178 refrigerator(PF_FREEZE);
4179 4338
4180 spin_lock_irq(&rq->lock); 4339 spin_lock_irq(&rq->lock);
4181 4340
@@ -4200,17 +4359,9 @@ static int migration_thread(void * data)
4200 req = list_entry(head->next, migration_req_t, list); 4359 req = list_entry(head->next, migration_req_t, list);
4201 list_del_init(head->next); 4360 list_del_init(head->next);
4202 4361
4203 if (req->type == REQ_MOVE_TASK) { 4362 spin_unlock(&rq->lock);
4204 spin_unlock(&rq->lock); 4363 __migrate_task(req->task, cpu, req->dest_cpu);
4205 __migrate_task(req->task, cpu, req->dest_cpu); 4364 local_irq_enable();
4206 local_irq_enable();
4207 } else if (req->type == REQ_SET_DOMAIN) {
4208 rq->sd = req->sd;
4209 spin_unlock_irq(&rq->lock);
4210 } else {
4211 spin_unlock_irq(&rq->lock);
4212 WARN_ON(1);
4213 }
4214 4365
4215 complete(&req->done); 4366 complete(&req->done);
4216 } 4367 }
@@ -4441,7 +4592,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4441 migration_req_t *req; 4592 migration_req_t *req;
4442 req = list_entry(rq->migration_queue.next, 4593 req = list_entry(rq->migration_queue.next,
4443 migration_req_t, list); 4594 migration_req_t, list);
4444 BUG_ON(req->type != REQ_MOVE_TASK);
4445 list_del_init(&req->list); 4595 list_del_init(&req->list);
4446 complete(&req->done); 4596 complete(&req->done);
4447 } 4597 }
@@ -4472,12 +4622,17 @@ int __init migration_init(void)
4472#endif 4622#endif
4473 4623
4474#ifdef CONFIG_SMP 4624#ifdef CONFIG_SMP
4475#define SCHED_DOMAIN_DEBUG 4625#undef SCHED_DOMAIN_DEBUG
4476#ifdef SCHED_DOMAIN_DEBUG 4626#ifdef SCHED_DOMAIN_DEBUG
4477static void sched_domain_debug(struct sched_domain *sd, int cpu) 4627static void sched_domain_debug(struct sched_domain *sd, int cpu)
4478{ 4628{
4479 int level = 0; 4629 int level = 0;
4480 4630
4631 if (!sd) {
4632 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4633 return;
4634 }
4635
4481 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4636 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4482 4637
4483 do { 4638 do {
@@ -4560,37 +4715,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
4560#define sched_domain_debug(sd, cpu) {} 4715#define sched_domain_debug(sd, cpu) {}
4561#endif 4716#endif
4562 4717
4718static int sd_degenerate(struct sched_domain *sd)
4719{
4720 if (cpus_weight(sd->span) == 1)
4721 return 1;
4722
4723 /* Following flags need at least 2 groups */
4724 if (sd->flags & (SD_LOAD_BALANCE |
4725 SD_BALANCE_NEWIDLE |
4726 SD_BALANCE_FORK |
4727 SD_BALANCE_EXEC)) {
4728 if (sd->groups != sd->groups->next)
4729 return 0;
4730 }
4731
4732 /* Following flags don't use groups */
4733 if (sd->flags & (SD_WAKE_IDLE |
4734 SD_WAKE_AFFINE |
4735 SD_WAKE_BALANCE))
4736 return 0;
4737
4738 return 1;
4739}
4740
4741static int sd_parent_degenerate(struct sched_domain *sd,
4742 struct sched_domain *parent)
4743{
4744 unsigned long cflags = sd->flags, pflags = parent->flags;
4745
4746 if (sd_degenerate(parent))
4747 return 1;
4748
4749 if (!cpus_equal(sd->span, parent->span))
4750 return 0;
4751
4752 /* Does parent contain flags not in child? */
4753 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
4754 if (cflags & SD_WAKE_AFFINE)
4755 pflags &= ~SD_WAKE_BALANCE;
4756 /* Flags needing groups don't count if only 1 group in parent */
4757 if (parent->groups == parent->groups->next) {
4758 pflags &= ~(SD_LOAD_BALANCE |
4759 SD_BALANCE_NEWIDLE |
4760 SD_BALANCE_FORK |
4761 SD_BALANCE_EXEC);
4762 }
4763 if (~cflags & pflags)
4764 return 0;
4765
4766 return 1;
4767}
4768
4563/* 4769/*
4564 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4770 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4565 * hold the hotplug lock. 4771 * hold the hotplug lock.
4566 */ 4772 */
4567void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) 4773void cpu_attach_domain(struct sched_domain *sd, int cpu)
4568{ 4774{
4569 migration_req_t req;
4570 unsigned long flags;
4571 runqueue_t *rq = cpu_rq(cpu); 4775 runqueue_t *rq = cpu_rq(cpu);
4572 int local = 1; 4776 struct sched_domain *tmp;
4573
4574 sched_domain_debug(sd, cpu);
4575 4777
4576 spin_lock_irqsave(&rq->lock, flags); 4778 /* Remove the sched domains which do not contribute to scheduling. */
4577 4779 for (tmp = sd; tmp; tmp = tmp->parent) {
4578 if (cpu == smp_processor_id() || !cpu_online(cpu)) { 4780 struct sched_domain *parent = tmp->parent;
4579 rq->sd = sd; 4781 if (!parent)
4580 } else { 4782 break;
4581 init_completion(&req.done); 4783 if (sd_parent_degenerate(tmp, parent))
4582 req.type = REQ_SET_DOMAIN; 4784 tmp->parent = parent->parent;
4583 req.sd = sd;
4584 list_add(&req.list, &rq->migration_queue);
4585 local = 0;
4586 } 4785 }
4587 4786
4588 spin_unlock_irqrestore(&rq->lock, flags); 4787 if (sd && sd_degenerate(sd))
4788 sd = sd->parent;
4589 4789
4590 if (!local) { 4790 sched_domain_debug(sd, cpu);
4591 wake_up_process(rq->migration_thread); 4791
4592 wait_for_completion(&req.done); 4792 rcu_assign_pointer(rq->sd, sd);
4593 }
4594} 4793}
4595 4794
4596/* cpus with isolated domains */ 4795/* cpus with isolated domains */
@@ -4622,7 +4821,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
4622 * covered by the given span, and will set each group's ->cpumask correctly, 4821 * covered by the given span, and will set each group's ->cpumask correctly,
4623 * and ->cpu_power to 0. 4822 * and ->cpu_power to 0.
4624 */ 4823 */
4625void __devinit init_sched_build_groups(struct sched_group groups[], 4824void init_sched_build_groups(struct sched_group groups[],
4626 cpumask_t span, int (*group_fn)(int cpu)) 4825 cpumask_t span, int (*group_fn)(int cpu))
4627{ 4826{
4628 struct sched_group *first = NULL, *last = NULL; 4827 struct sched_group *first = NULL, *last = NULL;
@@ -4658,13 +4857,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
4658 4857
4659 4858
4660#ifdef ARCH_HAS_SCHED_DOMAIN 4859#ifdef ARCH_HAS_SCHED_DOMAIN
4661extern void __devinit arch_init_sched_domains(void); 4860extern void build_sched_domains(const cpumask_t *cpu_map);
4662extern void __devinit arch_destroy_sched_domains(void); 4861extern void arch_init_sched_domains(const cpumask_t *cpu_map);
4862extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
4663#else 4863#else
4664#ifdef CONFIG_SCHED_SMT 4864#ifdef CONFIG_SCHED_SMT
4665static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4865static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4666static struct sched_group sched_group_cpus[NR_CPUS]; 4866static struct sched_group sched_group_cpus[NR_CPUS];
4667static int __devinit cpu_to_cpu_group(int cpu) 4867static int cpu_to_cpu_group(int cpu)
4668{ 4868{
4669 return cpu; 4869 return cpu;
4670} 4870}
@@ -4672,7 +4872,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
4672 4872
4673static DEFINE_PER_CPU(struct sched_domain, phys_domains); 4873static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4674static struct sched_group sched_group_phys[NR_CPUS]; 4874static struct sched_group sched_group_phys[NR_CPUS];
4675static int __devinit cpu_to_phys_group(int cpu) 4875static int cpu_to_phys_group(int cpu)
4676{ 4876{
4677#ifdef CONFIG_SCHED_SMT 4877#ifdef CONFIG_SCHED_SMT
4678 return first_cpu(cpu_sibling_map[cpu]); 4878 return first_cpu(cpu_sibling_map[cpu]);
@@ -4685,7 +4885,7 @@ static int __devinit cpu_to_phys_group(int cpu)
4685 4885
4686static DEFINE_PER_CPU(struct sched_domain, node_domains); 4886static DEFINE_PER_CPU(struct sched_domain, node_domains);
4687static struct sched_group sched_group_nodes[MAX_NUMNODES]; 4887static struct sched_group sched_group_nodes[MAX_NUMNODES];
4688static int __devinit cpu_to_node_group(int cpu) 4888static int cpu_to_node_group(int cpu)
4689{ 4889{
4690 return cpu_to_node(cpu); 4890 return cpu_to_node(cpu);
4691} 4891}
@@ -4716,39 +4916,28 @@ static void check_sibling_maps(void)
4716#endif 4916#endif
4717 4917
4718/* 4918/*
4719 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 4919 * Build sched domains for a given set of cpus and attach the sched domains
4920 * to the individual cpus
4720 */ 4921 */
4721static void __devinit arch_init_sched_domains(void) 4922static void build_sched_domains(const cpumask_t *cpu_map)
4722{ 4923{
4723 int i; 4924 int i;
4724 cpumask_t cpu_default_map;
4725
4726#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4727 check_sibling_maps();
4728#endif
4729 /*
4730 * Setup mask for cpus without special case scheduling requirements.
4731 * For now this just excludes isolated cpus, but could be used to
4732 * exclude other special cases in the future.
4733 */
4734 cpus_complement(cpu_default_map, cpu_isolated_map);
4735 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4736 4925
4737 /* 4926 /*
4738 * Set up domains. Isolated domains just stay on the dummy domain. 4927 * Set up domains for cpus specified by the cpu_map.
4739 */ 4928 */
4740 for_each_cpu_mask(i, cpu_default_map) { 4929 for_each_cpu_mask(i, *cpu_map) {
4741 int group; 4930 int group;
4742 struct sched_domain *sd = NULL, *p; 4931 struct sched_domain *sd = NULL, *p;
4743 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 4932 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4744 4933
4745 cpus_and(nodemask, nodemask, cpu_default_map); 4934 cpus_and(nodemask, nodemask, *cpu_map);
4746 4935
4747#ifdef CONFIG_NUMA 4936#ifdef CONFIG_NUMA
4748 sd = &per_cpu(node_domains, i); 4937 sd = &per_cpu(node_domains, i);
4749 group = cpu_to_node_group(i); 4938 group = cpu_to_node_group(i);
4750 *sd = SD_NODE_INIT; 4939 *sd = SD_NODE_INIT;
4751 sd->span = cpu_default_map; 4940 sd->span = *cpu_map;
4752 sd->groups = &sched_group_nodes[group]; 4941 sd->groups = &sched_group_nodes[group];
4753#endif 4942#endif
4754 4943
@@ -4766,7 +4955,7 @@ static void __devinit arch_init_sched_domains(void)
4766 group = cpu_to_cpu_group(i); 4955 group = cpu_to_cpu_group(i);
4767 *sd = SD_SIBLING_INIT; 4956 *sd = SD_SIBLING_INIT;
4768 sd->span = cpu_sibling_map[i]; 4957 sd->span = cpu_sibling_map[i];
4769 cpus_and(sd->span, sd->span, cpu_default_map); 4958 cpus_and(sd->span, sd->span, *cpu_map);
4770 sd->parent = p; 4959 sd->parent = p;
4771 sd->groups = &sched_group_cpus[group]; 4960 sd->groups = &sched_group_cpus[group];
4772#endif 4961#endif
@@ -4776,7 +4965,7 @@ static void __devinit arch_init_sched_domains(void)
4776 /* Set up CPU (sibling) groups */ 4965 /* Set up CPU (sibling) groups */
4777 for_each_online_cpu(i) { 4966 for_each_online_cpu(i) {
4778 cpumask_t this_sibling_map = cpu_sibling_map[i]; 4967 cpumask_t this_sibling_map = cpu_sibling_map[i];
4779 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); 4968 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4780 if (i != first_cpu(this_sibling_map)) 4969 if (i != first_cpu(this_sibling_map))
4781 continue; 4970 continue;
4782 4971
@@ -4789,7 +4978,7 @@ static void __devinit arch_init_sched_domains(void)
4789 for (i = 0; i < MAX_NUMNODES; i++) { 4978 for (i = 0; i < MAX_NUMNODES; i++) {
4790 cpumask_t nodemask = node_to_cpumask(i); 4979 cpumask_t nodemask = node_to_cpumask(i);
4791 4980
4792 cpus_and(nodemask, nodemask, cpu_default_map); 4981 cpus_and(nodemask, nodemask, *cpu_map);
4793 if (cpus_empty(nodemask)) 4982 if (cpus_empty(nodemask))
4794 continue; 4983 continue;
4795 4984
@@ -4799,12 +4988,12 @@ static void __devinit arch_init_sched_domains(void)
4799 4988
4800#ifdef CONFIG_NUMA 4989#ifdef CONFIG_NUMA
4801 /* Set up node groups */ 4990 /* Set up node groups */
4802 init_sched_build_groups(sched_group_nodes, cpu_default_map, 4991 init_sched_build_groups(sched_group_nodes, *cpu_map,
4803 &cpu_to_node_group); 4992 &cpu_to_node_group);
4804#endif 4993#endif
4805 4994
4806 /* Calculate CPU power for physical packages and nodes */ 4995 /* Calculate CPU power for physical packages and nodes */
4807 for_each_cpu_mask(i, cpu_default_map) { 4996 for_each_cpu_mask(i, *cpu_map) {
4808 int power; 4997 int power;
4809 struct sched_domain *sd; 4998 struct sched_domain *sd;
4810#ifdef CONFIG_SCHED_SMT 4999#ifdef CONFIG_SCHED_SMT
@@ -4828,7 +5017,7 @@ static void __devinit arch_init_sched_domains(void)
4828 } 5017 }
4829 5018
4830 /* Attach the domains */ 5019 /* Attach the domains */
4831 for_each_online_cpu(i) { 5020 for_each_cpu_mask(i, *cpu_map) {
4832 struct sched_domain *sd; 5021 struct sched_domain *sd;
4833#ifdef CONFIG_SCHED_SMT 5022#ifdef CONFIG_SCHED_SMT
4834 sd = &per_cpu(cpu_domains, i); 5023 sd = &per_cpu(cpu_domains, i);
@@ -4838,41 +5027,85 @@ static void __devinit arch_init_sched_domains(void)
4838 cpu_attach_domain(sd, i); 5027 cpu_attach_domain(sd, i);
4839 } 5028 }
4840} 5029}
5030/*
5031 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5032 */
5033static void arch_init_sched_domains(cpumask_t *cpu_map)
5034{
5035 cpumask_t cpu_default_map;
5036
5037#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5038 check_sibling_maps();
5039#endif
5040 /*
5041 * Setup mask for cpus without special case scheduling requirements.
5042 * For now this just excludes isolated cpus, but could be used to
5043 * exclude other special cases in the future.
5044 */
5045 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5046
5047 build_sched_domains(&cpu_default_map);
5048}
4841 5049
4842#ifdef CONFIG_HOTPLUG_CPU 5050static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
4843static void __devinit arch_destroy_sched_domains(void)
4844{ 5051{
4845 /* Do nothing: everything is statically allocated. */ 5052 /* Do nothing: everything is statically allocated. */
4846} 5053}
4847#endif
4848 5054
4849#endif /* ARCH_HAS_SCHED_DOMAIN */ 5055#endif /* ARCH_HAS_SCHED_DOMAIN */
4850 5056
4851/* 5057/*
4852 * Initial dummy domain for early boot and for hotplug cpu. Being static, 5058 * Detach sched domains from a group of cpus specified in cpu_map
4853 * it is initialized to zero, so all balancing flags are cleared which is 5059 * These cpus will now be attached to the NULL domain
4854 * what we want.
4855 */ 5060 */
4856static struct sched_domain sched_domain_dummy; 5061static inline void detach_destroy_domains(const cpumask_t *cpu_map)
5062{
5063 int i;
5064
5065 for_each_cpu_mask(i, *cpu_map)
5066 cpu_attach_domain(NULL, i);
5067 synchronize_sched();
5068 arch_destroy_sched_domains(cpu_map);
5069}
5070
5071/*
5072 * Partition sched domains as specified by the cpumasks below.
5073 * This attaches all cpus from the cpumasks to the NULL domain,
5074 * waits for a RCU quiescent period, recalculates sched
5075 * domain information and then attaches them back to the
5076 * correct sched domains
5077 * Call with hotplug lock held
5078 */
5079void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
5080{
5081 cpumask_t change_map;
5082
5083 cpus_and(*partition1, *partition1, cpu_online_map);
5084 cpus_and(*partition2, *partition2, cpu_online_map);
5085 cpus_or(change_map, *partition1, *partition2);
5086
5087 /* Detach sched domains from all of the affected cpus */
5088 detach_destroy_domains(&change_map);
5089 if (!cpus_empty(*partition1))
5090 build_sched_domains(partition1);
5091 if (!cpus_empty(*partition2))
5092 build_sched_domains(partition2);
5093}
4857 5094
4858#ifdef CONFIG_HOTPLUG_CPU 5095#ifdef CONFIG_HOTPLUG_CPU
4859/* 5096/*
4860 * Force a reinitialization of the sched domains hierarchy. The domains 5097 * Force a reinitialization of the sched domains hierarchy. The domains
4861 * and groups cannot be updated in place without racing with the balancing 5098 * and groups cannot be updated in place without racing with the balancing
4862 * code, so we temporarily attach all running cpus to a "dummy" domain 5099 * code, so we temporarily attach all running cpus to the NULL domain
4863 * which will prevent rebalancing while the sched domains are recalculated. 5100 * which will prevent rebalancing while the sched domains are recalculated.
4864 */ 5101 */
4865static int update_sched_domains(struct notifier_block *nfb, 5102static int update_sched_domains(struct notifier_block *nfb,
4866 unsigned long action, void *hcpu) 5103 unsigned long action, void *hcpu)
4867{ 5104{
4868 int i;
4869
4870 switch (action) { 5105 switch (action) {
4871 case CPU_UP_PREPARE: 5106 case CPU_UP_PREPARE:
4872 case CPU_DOWN_PREPARE: 5107 case CPU_DOWN_PREPARE:
4873 for_each_online_cpu(i) 5108 detach_destroy_domains(&cpu_online_map);
4874 cpu_attach_domain(&sched_domain_dummy, i);
4875 arch_destroy_sched_domains();
4876 return NOTIFY_OK; 5109 return NOTIFY_OK;
4877 5110
4878 case CPU_UP_CANCELED: 5111 case CPU_UP_CANCELED:
@@ -4888,7 +5121,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4888 } 5121 }
4889 5122
4890 /* The hotplug lock is already held by cpu_up/cpu_down */ 5123 /* The hotplug lock is already held by cpu_up/cpu_down */
4891 arch_init_sched_domains(); 5124 arch_init_sched_domains(&cpu_online_map);
4892 5125
4893 return NOTIFY_OK; 5126 return NOTIFY_OK;
4894} 5127}
@@ -4897,7 +5130,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4897void __init sched_init_smp(void) 5130void __init sched_init_smp(void)
4898{ 5131{
4899 lock_cpu_hotplug(); 5132 lock_cpu_hotplug();
4900 arch_init_sched_domains(); 5133 arch_init_sched_domains(&cpu_online_map);
4901 unlock_cpu_hotplug(); 5134 unlock_cpu_hotplug();
4902 /* XXX: Theoretical race here - CPU may be hotplugged now */ 5135 /* XXX: Theoretical race here - CPU may be hotplugged now */
4903 hotcpu_notifier(update_sched_domains, 0); 5136 hotcpu_notifier(update_sched_domains, 0);
@@ -4927,13 +5160,15 @@ void __init sched_init(void)
4927 5160
4928 rq = cpu_rq(i); 5161 rq = cpu_rq(i);
4929 spin_lock_init(&rq->lock); 5162 spin_lock_init(&rq->lock);
5163 rq->nr_running = 0;
4930 rq->active = rq->arrays; 5164 rq->active = rq->arrays;
4931 rq->expired = rq->arrays + 1; 5165 rq->expired = rq->arrays + 1;
4932 rq->best_expired_prio = MAX_PRIO; 5166 rq->best_expired_prio = MAX_PRIO;
4933 5167
4934#ifdef CONFIG_SMP 5168#ifdef CONFIG_SMP
4935 rq->sd = &sched_domain_dummy; 5169 rq->sd = NULL;
4936 rq->cpu_load = 0; 5170 for (j = 1; j < 3; j++)
5171 rq->cpu_load[j] = 0;
4937 rq->active_balance = 0; 5172 rq->active_balance = 0;
4938 rq->push_cpu = 0; 5173 rq->push_cpu = 0;
4939 rq->migration_thread = NULL; 5174 rq->migration_thread = NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index c89821b69ae3..ca1186eef938 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -213,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
213fastcall void recalc_sigpending_tsk(struct task_struct *t) 213fastcall void recalc_sigpending_tsk(struct task_struct *t)
214{ 214{
215 if (t->signal->group_stop_count > 0 || 215 if (t->signal->group_stop_count > 0 ||
216 (freezing(t)) ||
216 PENDING(&t->pending, &t->blocked) || 217 PENDING(&t->pending, &t->blocked) ||
217 PENDING(&t->signal->shared_pending, &t->blocked)) 218 PENDING(&t->signal->shared_pending, &t->blocked))
218 set_tsk_thread_flag(t, TIF_SIGPENDING); 219 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -2230,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2230 current->state = TASK_INTERRUPTIBLE; 2231 current->state = TASK_INTERRUPTIBLE;
2231 timeout = schedule_timeout(timeout); 2232 timeout = schedule_timeout(timeout);
2232 2233
2233 if (current->flags & PF_FREEZE) 2234 try_to_freeze();
2234 refrigerator(PF_FREEZE);
2235 spin_lock_irq(&current->sighand->siglock); 2235 spin_lock_irq(&current->sighand->siglock);
2236 sig = dequeue_signal(current, &these, &info); 2236 sig = dequeue_signal(current, &these, &info);
2237 current->blocked = current->real_blocked; 2237 current->blocked = current->real_blocked;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6116b25aa7cf..84a9d18aa8da 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -100,7 +100,7 @@ static int stop_machine(void)
100 stopmachine_state = STOPMACHINE_WAIT; 100 stopmachine_state = STOPMACHINE_WAIT;
101 101
102 for_each_online_cpu(i) { 102 for_each_online_cpu(i) {
103 if (i == _smp_processor_id()) 103 if (i == raw_smp_processor_id())
104 continue; 104 continue;
105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 105 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
106 if (ret < 0) 106 if (ret < 0)
@@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
182 182
183 /* If they don't care which CPU fn runs on, bind to any online one. */ 183 /* If they don't care which CPU fn runs on, bind to any online one. */
184 if (cpu == NR_CPUS) 184 if (cpu == NR_CPUS)
185 cpu = _smp_processor_id(); 185 cpu = raw_smp_processor_id();
186 186
187 p = kthread_create(do_stop, &smdata, "kstopmachine"); 187 p = kthread_create(do_stop, &smdata, "kstopmachine");
188 if (!IS_ERR(p)) { 188 if (!IS_ERR(p)) {
diff --git a/kernel/sys.c b/kernel/sys.c
index f006632c2ba7..9a24374c23bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/highuid.h> 17#include <linux/highuid.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/kexec.h>
19#include <linux/workqueue.h> 21#include <linux/workqueue.h>
20#include <linux/device.h> 22#include <linux/device.h>
21#include <linux/key.h> 23#include <linux/key.h>
@@ -405,6 +407,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
405 case LINUX_REBOOT_CMD_HALT: 407 case LINUX_REBOOT_CMD_HALT:
406 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); 408 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
407 system_state = SYSTEM_HALT; 409 system_state = SYSTEM_HALT;
410 device_suspend(PMSG_SUSPEND);
408 device_shutdown(); 411 device_shutdown();
409 printk(KERN_EMERG "System halted.\n"); 412 printk(KERN_EMERG "System halted.\n");
410 machine_halt(); 413 machine_halt();
@@ -415,6 +418,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
415 case LINUX_REBOOT_CMD_POWER_OFF: 418 case LINUX_REBOOT_CMD_POWER_OFF:
416 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); 419 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
417 system_state = SYSTEM_POWER_OFF; 420 system_state = SYSTEM_POWER_OFF;
421 device_suspend(PMSG_SUSPEND);
418 device_shutdown(); 422 device_shutdown();
419 printk(KERN_EMERG "Power down.\n"); 423 printk(KERN_EMERG "Power down.\n");
420 machine_power_off(); 424 machine_power_off();
@@ -431,11 +435,30 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
431 435
432 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); 436 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
433 system_state = SYSTEM_RESTART; 437 system_state = SYSTEM_RESTART;
438 device_suspend(PMSG_FREEZE);
434 device_shutdown(); 439 device_shutdown();
435 printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); 440 printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
436 machine_restart(buffer); 441 machine_restart(buffer);
437 break; 442 break;
438 443
444#ifdef CONFIG_KEXEC
445 case LINUX_REBOOT_CMD_KEXEC:
446 {
447 struct kimage *image;
448 image = xchg(&kexec_image, 0);
449 if (!image) {
450 unlock_kernel();
451 return -EINVAL;
452 }
453 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
454 system_state = SYSTEM_RESTART;
455 device_shutdown();
456 printk(KERN_EMERG "Starting new kernel\n");
457 machine_shutdown();
458 machine_kexec(image);
459 break;
460 }
461#endif
439#ifdef CONFIG_SOFTWARE_SUSPEND 462#ifdef CONFIG_SOFTWARE_SUSPEND
440 case LINUX_REBOOT_CMD_SW_SUSPEND: 463 case LINUX_REBOOT_CMD_SW_SUSPEND:
441 { 464 {
@@ -525,7 +548,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
525 } 548 }
526 if (new_egid != old_egid) 549 if (new_egid != old_egid)
527 { 550 {
528 current->mm->dumpable = 0; 551 current->mm->dumpable = suid_dumpable;
529 smp_wmb(); 552 smp_wmb();
530 } 553 }
531 if (rgid != (gid_t) -1 || 554 if (rgid != (gid_t) -1 ||
@@ -556,7 +579,7 @@ asmlinkage long sys_setgid(gid_t gid)
556 { 579 {
557 if(old_egid != gid) 580 if(old_egid != gid)
558 { 581 {
559 current->mm->dumpable=0; 582 current->mm->dumpable = suid_dumpable;
560 smp_wmb(); 583 smp_wmb();
561 } 584 }
562 current->gid = current->egid = current->sgid = current->fsgid = gid; 585 current->gid = current->egid = current->sgid = current->fsgid = gid;
@@ -565,7 +588,7 @@ asmlinkage long sys_setgid(gid_t gid)
565 { 588 {
566 if(old_egid != gid) 589 if(old_egid != gid)
567 { 590 {
568 current->mm->dumpable=0; 591 current->mm->dumpable = suid_dumpable;
569 smp_wmb(); 592 smp_wmb();
570 } 593 }
571 current->egid = current->fsgid = gid; 594 current->egid = current->fsgid = gid;
@@ -596,7 +619,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
596 619
597 if(dumpclear) 620 if(dumpclear)
598 { 621 {
599 current->mm->dumpable = 0; 622 current->mm->dumpable = suid_dumpable;
600 smp_wmb(); 623 smp_wmb();
601 } 624 }
602 current->uid = new_ruid; 625 current->uid = new_ruid;
@@ -653,7 +676,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
653 676
654 if (new_euid != old_euid) 677 if (new_euid != old_euid)
655 { 678 {
656 current->mm->dumpable=0; 679 current->mm->dumpable = suid_dumpable;
657 smp_wmb(); 680 smp_wmb();
658 } 681 }
659 current->fsuid = current->euid = new_euid; 682 current->fsuid = current->euid = new_euid;
@@ -703,7 +726,7 @@ asmlinkage long sys_setuid(uid_t uid)
703 726
704 if (old_euid != uid) 727 if (old_euid != uid)
705 { 728 {
706 current->mm->dumpable = 0; 729 current->mm->dumpable = suid_dumpable;
707 smp_wmb(); 730 smp_wmb();
708 } 731 }
709 current->fsuid = current->euid = uid; 732 current->fsuid = current->euid = uid;
@@ -748,7 +771,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
748 if (euid != (uid_t) -1) { 771 if (euid != (uid_t) -1) {
749 if (euid != current->euid) 772 if (euid != current->euid)
750 { 773 {
751 current->mm->dumpable = 0; 774 current->mm->dumpable = suid_dumpable;
752 smp_wmb(); 775 smp_wmb();
753 } 776 }
754 current->euid = euid; 777 current->euid = euid;
@@ -798,7 +821,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
798 if (egid != (gid_t) -1) { 821 if (egid != (gid_t) -1) {
799 if (egid != current->egid) 822 if (egid != current->egid)
800 { 823 {
801 current->mm->dumpable = 0; 824 current->mm->dumpable = suid_dumpable;
802 smp_wmb(); 825 smp_wmb();
803 } 826 }
804 current->egid = egid; 827 current->egid = egid;
@@ -845,7 +868,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
845 { 868 {
846 if (uid != old_fsuid) 869 if (uid != old_fsuid)
847 { 870 {
848 current->mm->dumpable = 0; 871 current->mm->dumpable = suid_dumpable;
849 smp_wmb(); 872 smp_wmb();
850 } 873 }
851 current->fsuid = uid; 874 current->fsuid = uid;
@@ -875,7 +898,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
875 { 898 {
876 if (gid != old_fsgid) 899 if (gid != old_fsgid)
877 { 900 {
878 current->mm->dumpable = 0; 901 current->mm->dumpable = suid_dumpable;
879 smp_wmb(); 902 smp_wmb();
880 } 903 }
881 current->fsgid = gid; 904 current->fsgid = gid;
@@ -894,35 +917,69 @@ asmlinkage long sys_times(struct tms __user * tbuf)
894 */ 917 */
895 if (tbuf) { 918 if (tbuf) {
896 struct tms tmp; 919 struct tms tmp;
897 struct task_struct *tsk = current;
898 struct task_struct *t;
899 cputime_t utime, stime, cutime, cstime; 920 cputime_t utime, stime, cutime, cstime;
900 921
901 read_lock(&tasklist_lock); 922#ifdef CONFIG_SMP
902 utime = tsk->signal->utime; 923 if (thread_group_empty(current)) {
903 stime = tsk->signal->stime; 924 /*
904 t = tsk; 925 * Single thread case without the use of any locks.
905 do { 926 *
906 utime = cputime_add(utime, t->utime); 927 * We may race with release_task if two threads are
907 stime = cputime_add(stime, t->stime); 928 * executing. However, release task first adds up the
908 t = next_thread(t); 929 * counters (__exit_signal) before removing the task
909 } while (t != tsk); 930 * from the process tasklist (__unhash_process).
910 931 * __exit_signal also acquires and releases the
911 /* 932 * siglock which results in the proper memory ordering
912 * While we have tasklist_lock read-locked, no dying thread 933 * so that the list modifications are always visible
913 * can be updating current->signal->[us]time. Instead, 934 * after the counters have been updated.
914 * we got their counts included in the live thread loop. 935 *
915 * However, another thread can come in right now and 936 * If the counters have been updated by the second thread
916 * do a wait call that updates current->signal->c[us]time. 937 * but the thread has not yet been removed from the list
917 * To make sure we always see that pair updated atomically, 938 * then the other branch will be executing which will
918 * we take the siglock around fetching them. 939 * block on tasklist_lock until the exit handling of the
919 */ 940 * other task is finished.
920 spin_lock_irq(&tsk->sighand->siglock); 941 *
921 cutime = tsk->signal->cutime; 942 * This also implies that the sighand->siglock cannot
922 cstime = tsk->signal->cstime; 943 * be held by another processor. So we can also
923 spin_unlock_irq(&tsk->sighand->siglock); 944 * skip acquiring that lock.
924 read_unlock(&tasklist_lock); 945 */
946 utime = cputime_add(current->signal->utime, current->utime);
947 stime = cputime_add(current->signal->utime, current->stime);
948 cutime = current->signal->cutime;
949 cstime = current->signal->cstime;
950 } else
951#endif
952 {
925 953
954 /* Process with multiple threads */
955 struct task_struct *tsk = current;
956 struct task_struct *t;
957
958 read_lock(&tasklist_lock);
959 utime = tsk->signal->utime;
960 stime = tsk->signal->stime;
961 t = tsk;
962 do {
963 utime = cputime_add(utime, t->utime);
964 stime = cputime_add(stime, t->stime);
965 t = next_thread(t);
966 } while (t != tsk);
967
968 /*
969 * While we have tasklist_lock read-locked, no dying thread
970 * can be updating current->signal->[us]time. Instead,
971 * we got their counts included in the live thread loop.
972 * However, another thread can come in right now and
973 * do a wait call that updates current->signal->c[us]time.
974 * To make sure we always see that pair updated atomically,
975 * we take the siglock around fetching them.
976 */
977 spin_lock_irq(&tsk->sighand->siglock);
978 cutime = tsk->signal->cutime;
979 cstime = tsk->signal->cstime;
980 spin_unlock_irq(&tsk->sighand->siglock);
981 read_unlock(&tasklist_lock);
982 }
926 tmp.tms_utime = cputime_to_clock_t(utime); 983 tmp.tms_utime = cputime_to_clock_t(utime);
927 tmp.tms_stime = cputime_to_clock_t(stime); 984 tmp.tms_stime = cputime_to_clock_t(stime);
928 tmp.tms_cutime = cputime_to_clock_t(cutime); 985 tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1225,7 +1282,7 @@ static void groups_sort(struct group_info *group_info)
1225} 1282}
1226 1283
1227/* a simple bsearch */ 1284/* a simple bsearch */
1228static int groups_search(struct group_info *group_info, gid_t grp) 1285int groups_search(struct group_info *group_info, gid_t grp)
1229{ 1286{
1230 int left, right; 1287 int left, right;
1231 1288
@@ -1652,7 +1709,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1652 error = 1; 1709 error = 1;
1653 break; 1710 break;
1654 case PR_SET_DUMPABLE: 1711 case PR_SET_DUMPABLE:
1655 if (arg2 != 0 && arg2 != 1) { 1712 if (arg2 < 0 || arg2 > 2) {
1656 error = -EINVAL; 1713 error = -EINVAL;
1657 break; 1714 break;
1658 } 1715 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0dda70ed1f98..29196ce9b40f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie); 18cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon); 19cond_syscall(sys_swapon);
20cond_syscall(sys_swapoff); 20cond_syscall(sys_swapoff);
21cond_syscall(sys_kexec_load);
22cond_syscall(compat_sys_kexec_load);
21cond_syscall(sys_init_module); 23cond_syscall(sys_init_module);
22cond_syscall(sys_delete_module); 24cond_syscall(sys_delete_module);
23cond_syscall(sys_socketpair); 25cond_syscall(sys_socketpair);
@@ -77,6 +79,7 @@ cond_syscall(sys_request_key);
77cond_syscall(sys_keyctl); 79cond_syscall(sys_keyctl);
78cond_syscall(compat_sys_keyctl); 80cond_syscall(compat_sys_keyctl);
79cond_syscall(compat_sys_socketcall); 81cond_syscall(compat_sys_socketcall);
82cond_syscall(sys_set_zone_reclaim);
80 83
81/* arch-specific weak syscall entries */ 84/* arch-specific weak syscall entries */
82cond_syscall(sys_pciconfig_read); 85cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 701d12c63068..270ee7fadbd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio;
58extern int max_threads; 58extern int max_threads;
59extern int sysrq_enabled; 59extern int sysrq_enabled;
60extern int core_uses_pid; 60extern int core_uses_pid;
61extern int suid_dumpable;
61extern char core_pattern[]; 62extern char core_pattern[];
62extern int cad_pid; 63extern int cad_pid;
63extern int pid_max; 64extern int pid_max;
@@ -950,6 +951,14 @@ static ctl_table fs_table[] = {
950 .proc_handler = &proc_dointvec, 951 .proc_handler = &proc_dointvec,
951 }, 952 },
952#endif 953#endif
954 {
955 .ctl_name = KERN_SETUID_DUMPABLE,
956 .procname = "suid_dumpable",
957 .data = &suid_dumpable,
958 .maxlen = sizeof(int),
959 .mode = 0644,
960 .proc_handler = &proc_dointvec,
961 },
953 { .ctl_name = 0 } 962 { .ctl_name = 0 }
954}; 963};
955 964
@@ -991,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
991 int error = parse_table(name, nlen, oldval, oldlenp, 1000 int error = parse_table(name, nlen, oldval, oldlenp,
992 newval, newlen, head->ctl_table, 1001 newval, newlen, head->ctl_table,
993 &context); 1002 &context);
994 if (context) 1003 kfree(context);
995 kfree(context);
996 if (error != -ENOTDIR) 1004 if (error != -ENOTDIR)
997 return error; 1005 return error;
998 tmp = tmp->next; 1006 tmp = tmp->next;
diff --git a/kernel/timer.c b/kernel/timer.c
index 207aa4f0aa10..f2a11887a726 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec);
57#define TVN_MASK (TVN_SIZE - 1) 57#define TVN_MASK (TVN_SIZE - 1)
58#define TVR_MASK (TVR_SIZE - 1) 58#define TVR_MASK (TVR_SIZE - 1)
59 59
60struct timer_base_s {
61 spinlock_t lock;
62 struct timer_list *running_timer;
63};
64
60typedef struct tvec_s { 65typedef struct tvec_s {
61 struct list_head vec[TVN_SIZE]; 66 struct list_head vec[TVN_SIZE];
62} tvec_t; 67} tvec_t;
@@ -66,9 +71,8 @@ typedef struct tvec_root_s {
66} tvec_root_t; 71} tvec_root_t;
67 72
68struct tvec_t_base_s { 73struct tvec_t_base_s {
69 spinlock_t lock; 74 struct timer_base_s t_base;
70 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
71 struct timer_list *running_timer;
72 tvec_root_t tv1; 76 tvec_root_t tv1;
73 tvec_t tv2; 77 tvec_t tv2;
74 tvec_t tv3; 78 tvec_t tv3;
@@ -77,18 +81,16 @@ struct tvec_t_base_s {
77} ____cacheline_aligned_in_smp; 81} ____cacheline_aligned_in_smp;
78 82
79typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
84static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
80 85
81static inline void set_running_timer(tvec_base_t *base, 86static inline void set_running_timer(tvec_base_t *base,
82 struct timer_list *timer) 87 struct timer_list *timer)
83{ 88{
84#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
85 base->running_timer = timer; 90 base->t_base.running_timer = timer;
86#endif 91#endif
87} 92}
88 93
89/* Fake initialization */
90static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
91
92static void check_timer_failed(struct timer_list *timer) 94static void check_timer_failed(struct timer_list *timer)
93{ 95{
94 static int whine_count; 96 static int whine_count;
@@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer)
103 /* 105 /*
104 * Now fix it up 106 * Now fix it up
105 */ 107 */
106 spin_lock_init(&timer->lock);
107 timer->magic = TIMER_MAGIC; 108 timer->magic = TIMER_MAGIC;
108} 109}
109 110
@@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
156 list_add_tail(&timer->entry, vec); 157 list_add_tail(&timer->entry, vec);
157} 158}
158 159
160typedef struct timer_base_s timer_base_t;
161/*
162 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
163 * at compile time, and we need timer->base to lock the timer.
164 */
165timer_base_t __init_timer_base
166 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
167EXPORT_SYMBOL(__init_timer_base);
168
169/***
170 * init_timer - initialize a timer.
171 * @timer: the timer to be initialized
172 *
173 * init_timer() must be done to a timer prior calling *any* of the
174 * other timer functions.
175 */
176void fastcall init_timer(struct timer_list *timer)
177{
178 timer->entry.next = NULL;
179 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
180 timer->magic = TIMER_MAGIC;
181}
182EXPORT_SYMBOL(init_timer);
183
184static inline void detach_timer(struct timer_list *timer,
185 int clear_pending)
186{
187 struct list_head *entry = &timer->entry;
188
189 __list_del(entry->prev, entry->next);
190 if (clear_pending)
191 entry->next = NULL;
192 entry->prev = LIST_POISON2;
193}
194
195/*
196 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
197 * means that all timers which are tied to this base via timer->base are
198 * locked, and the base itself is locked too.
199 *
200 * So __run_timers/migrate_timers can safely modify all timers which could
201 * be found on ->tvX lists.
202 *
203 * When the timer's base is locked, and the timer removed from list, it is
204 * possible to set timer->base = NULL and drop the lock: the timer remains
205 * locked.
206 */
207static timer_base_t *lock_timer_base(struct timer_list *timer,
208 unsigned long *flags)
209{
210 timer_base_t *base;
211
212 for (;;) {
213 base = timer->base;
214 if (likely(base != NULL)) {
215 spin_lock_irqsave(&base->lock, *flags);
216 if (likely(base == timer->base))
217 return base;
218 /* The timer has migrated to another CPU */
219 spin_unlock_irqrestore(&base->lock, *flags);
220 }
221 cpu_relax();
222 }
223}
224
159int __mod_timer(struct timer_list *timer, unsigned long expires) 225int __mod_timer(struct timer_list *timer, unsigned long expires)
160{ 226{
161 tvec_base_t *old_base, *new_base; 227 timer_base_t *base;
228 tvec_base_t *new_base;
162 unsigned long flags; 229 unsigned long flags;
163 int ret = 0; 230 int ret = 0;
164 231
165 BUG_ON(!timer->function); 232 BUG_ON(!timer->function);
166
167 check_timer(timer); 233 check_timer(timer);
168 234
169 spin_lock_irqsave(&timer->lock, flags); 235 base = lock_timer_base(timer, &flags);
236
237 if (timer_pending(timer)) {
238 detach_timer(timer, 0);
239 ret = 1;
240 }
241
170 new_base = &__get_cpu_var(tvec_bases); 242 new_base = &__get_cpu_var(tvec_bases);
171repeat:
172 old_base = timer->base;
173 243
174 /* 244 if (base != &new_base->t_base) {
175 * Prevent deadlocks via ordering by old_base < new_base.
176 */
177 if (old_base && (new_base != old_base)) {
178 if (old_base < new_base) {
179 spin_lock(&new_base->lock);
180 spin_lock(&old_base->lock);
181 } else {
182 spin_lock(&old_base->lock);
183 spin_lock(&new_base->lock);
184 }
185 /* 245 /*
186 * The timer base might have been cancelled while we were 246 * We are trying to schedule the timer on the local CPU.
187 * trying to take the lock(s): 247 * However we can't change timer's base while it is running,
248 * otherwise del_timer_sync() can't detect that the timer's
249 * handler yet has not finished. This also guarantees that
250 * the timer is serialized wrt itself.
188 */ 251 */
189 if (timer->base != old_base) { 252 if (unlikely(base->running_timer == timer)) {
190 spin_unlock(&new_base->lock); 253 /* The timer remains on a former base */
191 spin_unlock(&old_base->lock); 254 new_base = container_of(base, tvec_base_t, t_base);
192 goto repeat; 255 } else {
193 } 256 /* See the comment in lock_timer_base() */
194 } else { 257 timer->base = NULL;
195 spin_lock(&new_base->lock); 258 spin_unlock(&base->lock);
196 if (timer->base != old_base) { 259 spin_lock(&new_base->t_base.lock);
197 spin_unlock(&new_base->lock); 260 timer->base = &new_base->t_base;
198 goto repeat;
199 } 261 }
200 } 262 }
201 263
202 /*
203 * Delete the previous timeout (if there was any), and install
204 * the new one:
205 */
206 if (old_base) {
207 list_del(&timer->entry);
208 ret = 1;
209 }
210 timer->expires = expires; 264 timer->expires = expires;
211 internal_add_timer(new_base, timer); 265 internal_add_timer(new_base, timer);
212 timer->base = new_base; 266 spin_unlock_irqrestore(&new_base->t_base.lock, flags);
213
214 if (old_base && (new_base != old_base))
215 spin_unlock(&old_base->lock);
216 spin_unlock(&new_base->lock);
217 spin_unlock_irqrestore(&timer->lock, flags);
218 267
219 return ret; 268 return ret;
220} 269}
@@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
232{ 281{
233 tvec_base_t *base = &per_cpu(tvec_bases, cpu); 282 tvec_base_t *base = &per_cpu(tvec_bases, cpu);
234 unsigned long flags; 283 unsigned long flags;
235 284
236 BUG_ON(timer_pending(timer) || !timer->function); 285 BUG_ON(timer_pending(timer) || !timer->function);
237 286
238 check_timer(timer); 287 check_timer(timer);
239 288
240 spin_lock_irqsave(&base->lock, flags); 289 spin_lock_irqsave(&base->t_base.lock, flags);
290 timer->base = &base->t_base;
241 internal_add_timer(base, timer); 291 internal_add_timer(base, timer);
242 timer->base = base; 292 spin_unlock_irqrestore(&base->t_base.lock, flags);
243 spin_unlock_irqrestore(&base->lock, flags);
244} 293}
245 294
246 295
@@ -295,109 +344,84 @@ EXPORT_SYMBOL(mod_timer);
295 */ 344 */
296int del_timer(struct timer_list *timer) 345int del_timer(struct timer_list *timer)
297{ 346{
347 timer_base_t *base;
298 unsigned long flags; 348 unsigned long flags;
299 tvec_base_t *base; 349 int ret = 0;
300 350
301 check_timer(timer); 351 check_timer(timer);
302 352
303repeat: 353 if (timer_pending(timer)) {
304 base = timer->base; 354 base = lock_timer_base(timer, &flags);
305 if (!base) 355 if (timer_pending(timer)) {
306 return 0; 356 detach_timer(timer, 1);
307 spin_lock_irqsave(&base->lock, flags); 357 ret = 1;
308 if (base != timer->base) { 358 }
309 spin_unlock_irqrestore(&base->lock, flags); 359 spin_unlock_irqrestore(&base->lock, flags);
310 goto repeat;
311 } 360 }
312 list_del(&timer->entry);
313 /* Need to make sure that anybody who sees a NULL base also sees the list ops */
314 smp_wmb();
315 timer->base = NULL;
316 spin_unlock_irqrestore(&base->lock, flags);
317 361
318 return 1; 362 return ret;
319} 363}
320 364
321EXPORT_SYMBOL(del_timer); 365EXPORT_SYMBOL(del_timer);
322 366
323#ifdef CONFIG_SMP 367#ifdef CONFIG_SMP
324/*** 368/*
325 * del_timer_sync - deactivate a timer and wait for the handler to finish. 369 * This function tries to deactivate a timer. Upon successful (ret >= 0)
326 * @timer: the timer to be deactivated 370 * exit the timer is not queued and the handler is not running on any CPU.
327 *
328 * This function only differs from del_timer() on SMP: besides deactivating
329 * the timer it also makes sure the handler has finished executing on other
330 * CPUs.
331 *
332 * Synchronization rules: callers must prevent restarting of the timer,
333 * otherwise this function is meaningless. It must not be called from
334 * interrupt contexts. The caller must not hold locks which would prevent
335 * completion of the timer's handler. Upon exit the timer is not queued and
336 * the handler is not running on any CPU.
337 *
338 * The function returns whether it has deactivated a pending timer or not.
339 * 371 *
340 * del_timer_sync() is slow and complicated because it copes with timer 372 * It must not be called from interrupt contexts.
341 * handlers which re-arm the timer (periodic timers). If the timer handler
342 * is known to not do this (a single shot timer) then use
343 * del_singleshot_timer_sync() instead.
344 */ 373 */
345int del_timer_sync(struct timer_list *timer) 374int try_to_del_timer_sync(struct timer_list *timer)
346{ 375{
347 tvec_base_t *base; 376 timer_base_t *base;
348 int i, ret = 0; 377 unsigned long flags;
378 int ret = -1;
349 379
350 check_timer(timer); 380 base = lock_timer_base(timer, &flags);
351 381
352del_again: 382 if (base->running_timer == timer)
353 ret += del_timer(timer); 383 goto out;
354 384
355 for_each_online_cpu(i) { 385 ret = 0;
356 base = &per_cpu(tvec_bases, i); 386 if (timer_pending(timer)) {
357 if (base->running_timer == timer) { 387 detach_timer(timer, 1);
358 while (base->running_timer == timer) { 388 ret = 1;
359 cpu_relax();
360 preempt_check_resched();
361 }
362 break;
363 }
364 } 389 }
365 smp_rmb(); 390out:
366 if (timer_pending(timer)) 391 spin_unlock_irqrestore(&base->lock, flags);
367 goto del_again;
368 392
369 return ret; 393 return ret;
370} 394}
371EXPORT_SYMBOL(del_timer_sync);
372 395
373/*** 396/***
374 * del_singleshot_timer_sync - deactivate a non-recursive timer 397 * del_timer_sync - deactivate a timer and wait for the handler to finish.
375 * @timer: the timer to be deactivated 398 * @timer: the timer to be deactivated
376 * 399 *
377 * This function is an optimization of del_timer_sync for the case where the 400 * This function only differs from del_timer() on SMP: besides deactivating
378 * caller can guarantee the timer does not reschedule itself in its timer 401 * the timer it also makes sure the handler has finished executing on other
379 * function. 402 * CPUs.
380 * 403 *
381 * Synchronization rules: callers must prevent restarting of the timer, 404 * Synchronization rules: callers must prevent restarting of the timer,
382 * otherwise this function is meaningless. It must not be called from 405 * otherwise this function is meaningless. It must not be called from
383 * interrupt contexts. The caller must not hold locks which wold prevent 406 * interrupt contexts. The caller must not hold locks which would prevent
384 * completion of the timer's handler. Upon exit the timer is not queued and 407 * completion of the timer's handler. The timer's handler must not call
385 * the handler is not running on any CPU. 408 * add_timer_on(). Upon exit the timer is not queued and the handler is
409 * not running on any CPU.
386 * 410 *
387 * The function returns whether it has deactivated a pending timer or not. 411 * The function returns whether it has deactivated a pending timer or not.
388 */ 412 */
389int del_singleshot_timer_sync(struct timer_list *timer) 413int del_timer_sync(struct timer_list *timer)
390{ 414{
391 int ret = del_timer(timer); 415 check_timer(timer);
392 416
393 if (!ret) { 417 for (;;) {
394 ret = del_timer_sync(timer); 418 int ret = try_to_del_timer_sync(timer);
395 BUG_ON(ret); 419 if (ret >= 0)
420 return ret;
396 } 421 }
397
398 return ret;
399} 422}
400EXPORT_SYMBOL(del_singleshot_timer_sync); 423
424EXPORT_SYMBOL(del_timer_sync);
401#endif 425#endif
402 426
403static int cascade(tvec_base_t *base, tvec_t *tv, int index) 427static int cascade(tvec_base_t *base, tvec_t *tv, int index)
@@ -415,7 +439,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
415 struct timer_list *tmp; 439 struct timer_list *tmp;
416 440
417 tmp = list_entry(curr, struct timer_list, entry); 441 tmp = list_entry(curr, struct timer_list, entry);
418 BUG_ON(tmp->base != base); 442 BUG_ON(tmp->base != &base->t_base);
419 curr = curr->next; 443 curr = curr->next;
420 internal_add_timer(base, tmp); 444 internal_add_timer(base, tmp);
421 } 445 }
@@ -437,7 +461,7 @@ static inline void __run_timers(tvec_base_t *base)
437{ 461{
438 struct timer_list *timer; 462 struct timer_list *timer;
439 463
440 spin_lock_irq(&base->lock); 464 spin_lock_irq(&base->t_base.lock);
441 while (time_after_eq(jiffies, base->timer_jiffies)) { 465 while (time_after_eq(jiffies, base->timer_jiffies)) {
442 struct list_head work_list = LIST_HEAD_INIT(work_list); 466 struct list_head work_list = LIST_HEAD_INIT(work_list);
443 struct list_head *head = &work_list; 467 struct list_head *head = &work_list;
@@ -453,8 +477,7 @@ static inline void __run_timers(tvec_base_t *base)
453 cascade(base, &base->tv5, INDEX(3)); 477 cascade(base, &base->tv5, INDEX(3));
454 ++base->timer_jiffies; 478 ++base->timer_jiffies;
455 list_splice_init(base->tv1.vec + index, &work_list); 479 list_splice_init(base->tv1.vec + index, &work_list);
456repeat: 480 while (!list_empty(head)) {
457 if (!list_empty(head)) {
458 void (*fn)(unsigned long); 481 void (*fn)(unsigned long);
459 unsigned long data; 482 unsigned long data;
460 483
@@ -462,25 +485,26 @@ repeat:
462 fn = timer->function; 485 fn = timer->function;
463 data = timer->data; 486 data = timer->data;
464 487
465 list_del(&timer->entry);
466 set_running_timer(base, timer); 488 set_running_timer(base, timer);
467 smp_wmb(); 489 detach_timer(timer, 1);
468 timer->base = NULL; 490 spin_unlock_irq(&base->t_base.lock);
469 spin_unlock_irq(&base->lock);
470 { 491 {
471 u32 preempt_count = preempt_count(); 492 int preempt_count = preempt_count();
472 fn(data); 493 fn(data);
473 if (preempt_count != preempt_count()) { 494 if (preempt_count != preempt_count()) {
474 printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); 495 printk(KERN_WARNING "huh, entered %p "
496 "with preempt_count %08x, exited"
497 " with %08x?\n",
498 fn, preempt_count,
499 preempt_count());
475 BUG(); 500 BUG();
476 } 501 }
477 } 502 }
478 spin_lock_irq(&base->lock); 503 spin_lock_irq(&base->t_base.lock);
479 goto repeat;
480 } 504 }
481 } 505 }
482 set_running_timer(base, NULL); 506 set_running_timer(base, NULL);
483 spin_unlock_irq(&base->lock); 507 spin_unlock_irq(&base->t_base.lock);
484} 508}
485 509
486#ifdef CONFIG_NO_IDLE_HZ 510#ifdef CONFIG_NO_IDLE_HZ
@@ -499,7 +523,7 @@ unsigned long next_timer_interrupt(void)
499 int i, j; 523 int i, j;
500 524
501 base = &__get_cpu_var(tvec_bases); 525 base = &__get_cpu_var(tvec_bases);
502 spin_lock(&base->lock); 526 spin_lock(&base->t_base.lock);
503 expires = base->timer_jiffies + (LONG_MAX >> 1); 527 expires = base->timer_jiffies + (LONG_MAX >> 1);
504 list = 0; 528 list = 0;
505 529
@@ -547,7 +571,7 @@ found:
547 expires = nte->expires; 571 expires = nte->expires;
548 } 572 }
549 } 573 }
550 spin_unlock(&base->lock); 574 spin_unlock(&base->t_base.lock);
551 return expires; 575 return expires;
552} 576}
553#endif 577#endif
@@ -1286,9 +1310,9 @@ static void __devinit init_timers_cpu(int cpu)
1286{ 1310{
1287 int j; 1311 int j;
1288 tvec_base_t *base; 1312 tvec_base_t *base;
1289 1313
1290 base = &per_cpu(tvec_bases, cpu); 1314 base = &per_cpu(tvec_bases, cpu);
1291 spin_lock_init(&base->lock); 1315 spin_lock_init(&base->t_base.lock);
1292 for (j = 0; j < TVN_SIZE; j++) { 1316 for (j = 0; j < TVN_SIZE; j++) {
1293 INIT_LIST_HEAD(base->tv5.vec + j); 1317 INIT_LIST_HEAD(base->tv5.vec + j);
1294 INIT_LIST_HEAD(base->tv4.vec + j); 1318 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1302,22 +1326,16 @@ static void __devinit init_timers_cpu(int cpu)
1302} 1326}
1303 1327
1304#ifdef CONFIG_HOTPLUG_CPU 1328#ifdef CONFIG_HOTPLUG_CPU
1305static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1329static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1306{ 1330{
1307 struct timer_list *timer; 1331 struct timer_list *timer;
1308 1332
1309 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1310 timer = list_entry(head->next, struct timer_list, entry); 1334 timer = list_entry(head->next, struct timer_list, entry);
1311 /* We're locking backwards from __mod_timer order here, 1335 detach_timer(timer, 0);
1312 beware deadlock. */ 1336 timer->base = &new_base->t_base;
1313 if (!spin_trylock(&timer->lock))
1314 return 0;
1315 list_del(&timer->entry);
1316 internal_add_timer(new_base, timer); 1337 internal_add_timer(new_base, timer);
1317 timer->base = new_base;
1318 spin_unlock(&timer->lock);
1319 } 1338 }
1320 return 1;
1321} 1339}
1322 1340
1323static void __devinit migrate_timers(int cpu) 1341static void __devinit migrate_timers(int cpu)
@@ -1331,39 +1349,24 @@ static void __devinit migrate_timers(int cpu)
1331 new_base = &get_cpu_var(tvec_bases); 1349 new_base = &get_cpu_var(tvec_bases);
1332 1350
1333 local_irq_disable(); 1351 local_irq_disable();
1334again: 1352 spin_lock(&new_base->t_base.lock);
1335 /* Prevent deadlocks via ordering by old_base < new_base. */ 1353 spin_lock(&old_base->t_base.lock);
1336 if (old_base < new_base) {
1337 spin_lock(&new_base->lock);
1338 spin_lock(&old_base->lock);
1339 } else {
1340 spin_lock(&old_base->lock);
1341 spin_lock(&new_base->lock);
1342 }
1343 1354
1344 if (old_base->running_timer) 1355 if (old_base->t_base.running_timer)
1345 BUG(); 1356 BUG();
1346 for (i = 0; i < TVR_SIZE; i++) 1357 for (i = 0; i < TVR_SIZE; i++)
1347 if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) 1358 migrate_timer_list(new_base, old_base->tv1.vec + i);
1348 goto unlock_again; 1359 for (i = 0; i < TVN_SIZE; i++) {
1349 for (i = 0; i < TVN_SIZE; i++) 1360 migrate_timer_list(new_base, old_base->tv2.vec + i);
1350 if (!migrate_timer_list(new_base, old_base->tv2.vec + i) 1361 migrate_timer_list(new_base, old_base->tv3.vec + i);
1351 || !migrate_timer_list(new_base, old_base->tv3.vec + i) 1362 migrate_timer_list(new_base, old_base->tv4.vec + i);
1352 || !migrate_timer_list(new_base, old_base->tv4.vec + i) 1363 migrate_timer_list(new_base, old_base->tv5.vec + i);
1353 || !migrate_timer_list(new_base, old_base->tv5.vec + i)) 1364 }
1354 goto unlock_again; 1365
1355 spin_unlock(&old_base->lock); 1366 spin_unlock(&old_base->t_base.lock);
1356 spin_unlock(&new_base->lock); 1367 spin_unlock(&new_base->t_base.lock);
1357 local_irq_enable(); 1368 local_irq_enable();
1358 put_cpu_var(tvec_bases); 1369 put_cpu_var(tvec_bases);
1359 return;
1360
1361unlock_again:
1362 /* Avoid deadlock with __mod_timer, by backing off. */
1363 spin_unlock(&old_base->lock);
1364 spin_unlock(&new_base->lock);
1365 cpu_relax();
1366 goto again;
1367} 1370}
1368#endif /* CONFIG_HOTPLUG_CPU */ 1371#endif /* CONFIG_HOTPLUG_CPU */
1369 1372
@@ -1594,7 +1597,7 @@ void msleep(unsigned int msecs)
1594EXPORT_SYMBOL(msleep); 1597EXPORT_SYMBOL(msleep);
1595 1598
1596/** 1599/**
1597 * msleep_interruptible - sleep waiting for waitqueue interruptions 1600 * msleep_interruptible - sleep waiting for signals
1598 * @msecs: Time in milliseconds to sleep for 1601 * @msecs: Time in milliseconds to sleep for
1599 */ 1602 */
1600unsigned long msleep_interruptible(unsigned int msecs) 1603unsigned long msleep_interruptible(unsigned int msecs)