aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.hz46
-rw-r--r--kernel/Kconfig.preempt65
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/capability.c20
-rw-r--r--kernel/cpu.c14
-rw-r--r--kernel/cpuset.c204
-rw-r--r--kernel/crash_dump.c61
-rw-r--r--kernel/exit.c22
-rw-r--r--kernel/fork.c30
-rw-r--r--kernel/irq/autoprobe.c9
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/spurious.c115
-rw-r--r--kernel/itimer.c45
-rw-r--r--kernel/kexec.c1063
-rw-r--r--kernel/kmod.c17
-rw-r--r--kernel/kprobes.c360
-rw-r--r--kernel/ksysfs.c13
-rw-r--r--kernel/module.c112
-rw-r--r--kernel/panic.c32
-rw-r--r--kernel/posix-timers.c52
-rw-r--r--kernel/power/Kconfig8
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/disk.c56
-rw-r--r--kernel/power/main.c32
-rw-r--r--kernel/power/poweroff.c4
-rw-r--r--kernel/power/process.c30
-rw-r--r--kernel/power/smp.c89
-rw-r--r--kernel/power/swsusp.c131
-rw-r--r--kernel/printk.c15
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/sched.c1086
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/sys.c202
-rw-r--r--kernel/sys_ni.c6
-rw-r--r--kernel/sysctl.c38
-rw-r--r--kernel/time.c2
-rw-r--r--kernel/timer.c355
-rw-r--r--kernel/user.c4
-rw-r--r--kernel/workqueue.c2
40 files changed, 3315 insertions, 1047 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 000000000000..248e1c396f8b
--- /dev/null
+++ b/kernel/Kconfig.hz
@@ -0,0 +1,46 @@
1#
2# Timer Interrupt Frequency Configuration
3#
4
5choice
6 prompt "Timer frequency"
7 default HZ_250
8 help
9 Allows the configuration of the timer frequency. It is customary
10 to have the timer interrupt run at 1000 HZ but 100 HZ may be more
11 beneficial for servers and NUMA systems that do not need to have
12 a fast response for user interaction and that may experience bus
13 contention and cacheline bounces as a result of timer interrupts.
14 Note that the timer interrupt occurs on each processor in an SMP
15 environment leading to NR_CPUS * HZ number of timer interrupts
16 per second.
17
18
19 config HZ_100
20 bool "100 HZ"
21 help
22 100 HZ is a typical choice for servers, SMP and NUMA systems
23 with lots of processors that may show reduced performance if
24 too many timer interrupts are occurring.
25
26 config HZ_250
27 bool "250 HZ"
28 help
29 250 HZ is a good compromise choice allowing server performance
30 while also showing good interactive responsiveness even
31 on SMP and NUMA systems.
32
33 config HZ_1000
34 bool "1000 HZ"
35 help
36 1000 HZ is the preferred choice for desktop systems and other
37 systems requiring fast interactive responses to events.
38
39endchoice
40
41config HZ
42 int
43 default 100 if HZ_100
44 default 250 if HZ_250
45 default 1000 if HZ_1000
46
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000000..0b46a5dff4c0
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,65 @@
1
2choice
3 prompt "Preemption Model"
4 default PREEMPT_NONE
5
6config PREEMPT_NONE
7 bool "No Forced Preemption (Server)"
8 help
9 This is the traditional Linux preemption model, geared towards
10 throughput. It will still provide good latencies most of the
11 time, but there are no guarantees and occasional longer delays
12 are possible.
13
14 Select this option if you are building a kernel for a server or
15 scientific/computation system, or if you want to maximize the
16 raw processing power of the kernel, irrespective of scheduling
17 latencies.
18
19config PREEMPT_VOLUNTARY
20 bool "Voluntary Kernel Preemption (Desktop)"
21 help
22 This option reduces the latency of the kernel by adding more
23 "explicit preemption points" to the kernel code. These new
24 preemption points have been selected to reduce the maximum
25 latency of rescheduling, providing faster application reactions,
26 at the cost of slightly lower throughput.
27
28 This allows reaction to interactive events by allowing a
29 low priority process to voluntarily preempt itself even if it
30 is in kernel mode executing a system call. This allows
31 applications to run more 'smoothly' even when the system is
32 under load.
33
34 Select this if you are building a kernel for a desktop system.
35
36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 help
39 This option reduces the latency of the kernel by making
40 all kernel code (that is not executing in a critical section)
41 preemptible. This allows reaction to interactive events by
42 permitting a low priority process to be preempted involuntarily
43 even if it is in kernel mode executing a system call and would
44 otherwise not be about to reach a natural preemption point.
45 This allows applications to run more 'smoothly' even when the
46 system is under load, at the cost of slightly lower throughput
47 and a slight runtime overhead to kernel code.
48
49 Select this if you are building a kernel for a desktop or
50 embedded system with latency requirements in the milliseconds
51 range.
52
53endchoice
54
55config PREEMPT_BKL
56 bool "Preempt The Big Kernel Lock"
57 depends on SMP || PREEMPT
58 default y
59 help
60 This option reduces the latency of the kernel by making the
61 big kernel lock preemptible.
62
63 Say Y here if you are building a kernel for a desktop system.
64 Say N if you are unsure.
65
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
17obj-$(CONFIG_KALLSYMS) += kallsyms.o 17obj-$(CONFIG_KALLSYMS) += kallsyms.o
18obj-$(CONFIG_PM) += power/ 18obj-$(CONFIG_PM) += power/
19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 19obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
20obj-$(CONFIG_KEXEC) += kexec.o
20obj-$(CONFIG_COMPAT) += compat.o 21obj-$(CONFIG_COMPAT) += compat.o
21obj-$(CONFIG_CPUSETS) += cpuset.o 22obj-$(CONFIG_CPUSETS) += cpuset.o
22obj-$(CONFIG_IKCONFIG) += configs.o 23obj-$(CONFIG_IKCONFIG) += configs.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
27obj-$(CONFIG_KPROBES) += kprobes.o 28obj-$(CONFIG_KPROBES) += kprobes.o
28obj-$(CONFIG_SYSFS) += ksysfs.o 29obj-$(CONFIG_SYSFS) += ksysfs.o
29obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
30obj-$(CONFIG_SECCOMP) += seccomp.o 32obj-$(CONFIG_SECCOMP) += seccomp.o
31 33
32ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 34ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/capability.c b/kernel/capability.c
index 64db1ee820c2..8986a37a67ea 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -31,8 +31,14 @@ static DEFINE_SPINLOCK(task_capability_lock);
31 * uninteresting and/or not to be changed. 31 * uninteresting and/or not to be changed.
32 */ 32 */
33 33
34/* 34/**
35 * sys_capget - get the capabilities of a given process. 35 * sys_capget - get the capabilities of a given process.
36 * @header: pointer to struct that contains capability version and
37 * target pid data
38 * @dataptr: pointer to struct that contains the effective, permitted,
39 * and inheritable capabilities that are returned
40 *
41 * Returns 0 on success and < 0 on error.
36 */ 42 */
37asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) 43asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
38{ 44{
@@ -141,8 +147,14 @@ static inline int cap_set_all(kernel_cap_t *effective,
141 return ret; 147 return ret;
142} 148}
143 149
144/* 150/**
145 * sys_capset - set capabilities for a given process, all processes, or all 151 * sys_capset - set capabilities for a process or a group of processes
152 * @header: pointer to struct that contains capability version and
153 * target pid data
154 * @data: pointer to struct that contains the effective, permitted,
155 * and inheritable capabilities
156 *
157 * Set capabilities for a given process, all processes, or all
146 * processes in a given process group. 158 * processes in a given process group.
147 * 159 *
148 * The restrictions on setting capabilities are specified as: 160 * The restrictions on setting capabilities are specified as:
@@ -152,6 +164,8 @@ static inline int cap_set_all(kernel_cap_t *effective,
152 * I: any raised capabilities must be a subset of the (old current) permitted 164 * I: any raised capabilities must be a subset of the (old current) permitted
153 * P: any raised capabilities must be a subset of the (old current) permitted 165 * P: any raised capabilities must be a subset of the (old current) permitted
154 * E: must be set to a subset of (new target) permitted 166 * E: must be set to a subset of (new target) permitted
167 *
168 * Returns 0 on success and < 0 on error.
155 */ 169 */
156asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) 170asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
157{ 171{
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 628f4ccda127..53d8263ae12e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused)
63{ 63{
64 int err; 64 int err;
65 65
66 /* Take offline: makes arch_cpu_down somewhat easier. */
67 cpu_clear(smp_processor_id(), cpu_online_map);
68
69 /* Ensure this CPU doesn't handle any more interrupts. */ 66 /* Ensure this CPU doesn't handle any more interrupts. */
70 err = __cpu_disable(); 67 err = __cpu_disable();
71 if (err < 0) 68 if (err < 0)
72 cpu_set(smp_processor_id(), cpu_online_map); 69 return err;
73 else
74 /* Force idle task to run as soon as we yield: it should
75 immediately notice cpu is offline and die quickly. */
76 sched_idle_next();
77 70
78 return err; 71 /* Force idle task to run as soon as we yield: it should
72 immediately notice cpu is offline and die quickly. */
73 sched_idle_next();
74 return 0;
79} 75}
80 76
81int cpu_down(unsigned int cpu) 77int cpu_down(unsigned int cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00e8f2575512..8ab1b4e518b8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = {
228 228
229static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) 229static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
230{ 230{
231 struct qstr qstr; 231 struct dentry *d = lookup_one_len(name, parent, strlen(name));
232 struct dentry *d;
233
234 qstr.name = name;
235 qstr.len = strlen(name);
236 qstr.hash = full_name_hash(name, qstr.len);
237 d = lookup_hash(&qstr, parent);
238 if (!IS_ERR(d)) 232 if (!IS_ERR(d))
239 d->d_op = &cpuset_dops; 233 d->d_op = &cpuset_dops;
240 return d; 234 return d;
@@ -404,21 +398,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
404 * to continue to serve a useful existence. Next time it's released, 398 * to continue to serve a useful existence. Next time it's released,
405 * we will get notified again, if it still has 'notify_on_release' set. 399 * we will get notified again, if it still has 'notify_on_release' set.
406 * 400 *
407 * Note final arg to call_usermodehelper() is 0 - that means 401 * The final arg to call_usermodehelper() is 0, which means don't
408 * don't wait. Since we are holding the global cpuset_sem here, 402 * wait. The separate /sbin/cpuset_release_agent task is forked by
409 * and we are asking another thread (started from keventd) to rmdir a 403 * call_usermodehelper(), then control in this thread returns here,
410 * cpuset, we can't wait - or we'd deadlock with the removing thread 404 * without waiting for the release agent task. We don't bother to
411 * on cpuset_sem. 405 * wait because the caller of this routine has no use for the exit
406 * status of the /sbin/cpuset_release_agent task, so no sense holding
407 * our caller up for that.
408 *
409 * The simple act of forking that task might require more memory,
410 * which might need cpuset_sem. So this routine must be called while
411 * cpuset_sem is not held, to avoid a possible deadlock. See also
412 * comments for check_for_release(), below.
412 */ 413 */
413 414
414static int cpuset_release_agent(char *cpuset_str) 415static void cpuset_release_agent(const char *pathbuf)
415{ 416{
416 char *argv[3], *envp[3]; 417 char *argv[3], *envp[3];
417 int i; 418 int i;
418 419
420 if (!pathbuf)
421 return;
422
419 i = 0; 423 i = 0;
420 argv[i++] = "/sbin/cpuset_release_agent"; 424 argv[i++] = "/sbin/cpuset_release_agent";
421 argv[i++] = cpuset_str; 425 argv[i++] = (char *)pathbuf;
422 argv[i] = NULL; 426 argv[i] = NULL;
423 427
424 i = 0; 428 i = 0;
@@ -427,17 +431,29 @@ static int cpuset_release_agent(char *cpuset_str)
427 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 431 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
428 envp[i] = NULL; 432 envp[i] = NULL;
429 433
430 return call_usermodehelper(argv[0], argv, envp, 0); 434 call_usermodehelper(argv[0], argv, envp, 0);
435 kfree(pathbuf);
431} 436}
432 437
433/* 438/*
434 * Either cs->count of using tasks transitioned to zero, or the 439 * Either cs->count of using tasks transitioned to zero, or the
435 * cs->children list of child cpusets just became empty. If this 440 * cs->children list of child cpusets just became empty. If this
436 * cs is notify_on_release() and now both the user count is zero and 441 * cs is notify_on_release() and now both the user count is zero and
437 * the list of children is empty, send notice to user land. 442 * the list of children is empty, prepare cpuset path in a kmalloc'd
443 * buffer, to be returned via ppathbuf, so that the caller can invoke
444 * cpuset_release_agent() with it later on, once cpuset_sem is dropped.
445 * Call here with cpuset_sem held.
446 *
447 * This check_for_release() routine is responsible for kmalloc'ing
448 * pathbuf. The above cpuset_release_agent() is responsible for
449 * kfree'ing pathbuf. The caller of these routines is responsible
450 * for providing a pathbuf pointer, initialized to NULL, then
451 * calling check_for_release() with cpuset_sem held and the address
452 * of the pathbuf pointer, then dropping cpuset_sem, then calling
453 * cpuset_release_agent() with pathbuf, as set by check_for_release().
438 */ 454 */
439 455
440static void check_for_release(struct cpuset *cs) 456static void check_for_release(struct cpuset *cs, char **ppathbuf)
441{ 457{
442 if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && 458 if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
443 list_empty(&cs->children)) { 459 list_empty(&cs->children)) {
@@ -447,10 +463,9 @@ static void check_for_release(struct cpuset *cs)
447 if (!buf) 463 if (!buf)
448 return; 464 return;
449 if (cpuset_path(cs, buf, PAGE_SIZE) < 0) 465 if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
450 goto out; 466 kfree(buf);
451 cpuset_release_agent(buf); 467 else
452out: 468 *ppathbuf = buf;
453 kfree(buf);
454 } 469 }
455} 470}
456 471
@@ -601,10 +616,75 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
601 return 0; 616 return 0;
602} 617}
603 618
619/*
620 * For a given cpuset cur, partition the system as follows
621 * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
622 * exclusive child cpusets
623 * b. All cpus in the current cpuset's cpus_allowed that are not part of any
624 * exclusive child cpusets
625 * Build these two partitions by calling partition_sched_domains
626 *
627 * Call with cpuset_sem held. May nest a call to the
628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
629 */
630
631/*
632 * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
633 * Disable letting 'cpu_exclusive' cpusets define dynamic sched
634 * domains, until the sched domain can handle partial nodes.
635 * Remove this #if hackery when sched domains fixed.
636 */
637#if 0
638static void update_cpu_domains(struct cpuset *cur)
639{
640 struct cpuset *c, *par = cur->parent;
641 cpumask_t pspan, cspan;
642
643 if (par == NULL || cpus_empty(cur->cpus_allowed))
644 return;
645
646 /*
647 * Get all cpus from parent's cpus_allowed not part of exclusive
648 * children
649 */
650 pspan = par->cpus_allowed;
651 list_for_each_entry(c, &par->children, sibling) {
652 if (is_cpu_exclusive(c))
653 cpus_andnot(pspan, pspan, c->cpus_allowed);
654 }
655 if (is_removed(cur) || !is_cpu_exclusive(cur)) {
656 cpus_or(pspan, pspan, cur->cpus_allowed);
657 if (cpus_equal(pspan, cur->cpus_allowed))
658 return;
659 cspan = CPU_MASK_NONE;
660 } else {
661 if (cpus_empty(pspan))
662 return;
663 cspan = cur->cpus_allowed;
664 /*
665 * Get all cpus from current cpuset's cpus_allowed not part
666 * of exclusive children
667 */
668 list_for_each_entry(c, &cur->children, sibling) {
669 if (is_cpu_exclusive(c))
670 cpus_andnot(cspan, cspan, c->cpus_allowed);
671 }
672 }
673
674 lock_cpu_hotplug();
675 partition_sched_domains(&pspan, &cspan);
676 unlock_cpu_hotplug();
677}
678#else
679static void update_cpu_domains(struct cpuset *cur)
680{
681}
682#endif
683
604static int update_cpumask(struct cpuset *cs, char *buf) 684static int update_cpumask(struct cpuset *cs, char *buf)
605{ 685{
606 struct cpuset trialcs; 686 struct cpuset trialcs;
607 int retval; 687 int retval, cpus_unchanged;
608 688
609 trialcs = *cs; 689 trialcs = *cs;
610 retval = cpulist_parse(buf, trialcs.cpus_allowed); 690 retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -614,9 +694,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
614 if (cpus_empty(trialcs.cpus_allowed)) 694 if (cpus_empty(trialcs.cpus_allowed))
615 return -ENOSPC; 695 return -ENOSPC;
616 retval = validate_change(cs, &trialcs); 696 retval = validate_change(cs, &trialcs);
617 if (retval == 0) 697 if (retval < 0)
618 cs->cpus_allowed = trialcs.cpus_allowed; 698 return retval;
619 return retval; 699 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
700 cs->cpus_allowed = trialcs.cpus_allowed;
701 if (is_cpu_exclusive(cs) && !cpus_unchanged)
702 update_cpu_domains(cs);
703 return 0;
620} 704}
621 705
622static int update_nodemask(struct cpuset *cs, char *buf) 706static int update_nodemask(struct cpuset *cs, char *buf)
@@ -652,7 +736,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
652{ 736{
653 int turning_on; 737 int turning_on;
654 struct cpuset trialcs; 738 struct cpuset trialcs;
655 int err; 739 int err, cpu_exclusive_changed;
656 740
657 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 741 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
658 742
@@ -663,23 +747,28 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
663 clear_bit(bit, &trialcs.flags); 747 clear_bit(bit, &trialcs.flags);
664 748
665 err = validate_change(cs, &trialcs); 749 err = validate_change(cs, &trialcs);
666 if (err == 0) { 750 if (err < 0)
667 if (turning_on) 751 return err;
668 set_bit(bit, &cs->flags); 752 cpu_exclusive_changed =
669 else 753 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
670 clear_bit(bit, &cs->flags); 754 if (turning_on)
671 } 755 set_bit(bit, &cs->flags);
672 return err; 756 else
757 clear_bit(bit, &cs->flags);
758
759 if (cpu_exclusive_changed)
760 update_cpu_domains(cs);
761 return 0;
673} 762}
674 763
675static int attach_task(struct cpuset *cs, char *buf) 764static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
676{ 765{
677 pid_t pid; 766 pid_t pid;
678 struct task_struct *tsk; 767 struct task_struct *tsk;
679 struct cpuset *oldcs; 768 struct cpuset *oldcs;
680 cpumask_t cpus; 769 cpumask_t cpus;
681 770
682 if (sscanf(buf, "%d", &pid) != 1) 771 if (sscanf(pidbuf, "%d", &pid) != 1)
683 return -EIO; 772 return -EIO;
684 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 773 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
685 return -ENOSPC; 774 return -ENOSPC;
@@ -722,7 +811,7 @@ static int attach_task(struct cpuset *cs, char *buf)
722 811
723 put_task_struct(tsk); 812 put_task_struct(tsk);
724 if (atomic_dec_and_test(&oldcs->count)) 813 if (atomic_dec_and_test(&oldcs->count))
725 check_for_release(oldcs); 814 check_for_release(oldcs, ppathbuf);
726 return 0; 815 return 0;
727} 816}
728 817
@@ -746,6 +835,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
746 struct cftype *cft = __d_cft(file->f_dentry); 835 struct cftype *cft = __d_cft(file->f_dentry);
747 cpuset_filetype_t type = cft->private; 836 cpuset_filetype_t type = cft->private;
748 char *buffer; 837 char *buffer;
838 char *pathbuf = NULL;
749 int retval = 0; 839 int retval = 0;
750 840
751 /* Crude upper limit on largest legitimate cpulist user might write. */ 841 /* Crude upper limit on largest legitimate cpulist user might write. */
@@ -786,7 +876,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
786 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); 876 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
787 break; 877 break;
788 case FILE_TASKLIST: 878 case FILE_TASKLIST:
789 retval = attach_task(cs, buffer); 879 retval = attach_task(cs, buffer, &pathbuf);
790 break; 880 break;
791 default: 881 default:
792 retval = -EINVAL; 882 retval = -EINVAL;
@@ -797,6 +887,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
797 retval = nbytes; 887 retval = nbytes;
798out2: 888out2:
799 up(&cpuset_sem); 889 up(&cpuset_sem);
890 cpuset_release_agent(pathbuf);
800out1: 891out1:
801 kfree(buffer); 892 kfree(buffer);
802 return retval; 893 return retval;
@@ -1302,6 +1393,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1302 struct cpuset *cs = dentry->d_fsdata; 1393 struct cpuset *cs = dentry->d_fsdata;
1303 struct dentry *d; 1394 struct dentry *d;
1304 struct cpuset *parent; 1395 struct cpuset *parent;
1396 char *pathbuf = NULL;
1305 1397
1306 /* the vfs holds both inode->i_sem already */ 1398 /* the vfs holds both inode->i_sem already */
1307 1399
@@ -1315,18 +1407,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1315 up(&cpuset_sem); 1407 up(&cpuset_sem);
1316 return -EBUSY; 1408 return -EBUSY;
1317 } 1409 }
1318 spin_lock(&cs->dentry->d_lock);
1319 parent = cs->parent; 1410 parent = cs->parent;
1320 set_bit(CS_REMOVED, &cs->flags); 1411 set_bit(CS_REMOVED, &cs->flags);
1412 if (is_cpu_exclusive(cs))
1413 update_cpu_domains(cs);
1321 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1414 list_del(&cs->sibling); /* delete my sibling from parent->children */
1322 if (list_empty(&parent->children)) 1415 if (list_empty(&parent->children))
1323 check_for_release(parent); 1416 check_for_release(parent, &pathbuf);
1417 spin_lock(&cs->dentry->d_lock);
1324 d = dget(cs->dentry); 1418 d = dget(cs->dentry);
1325 cs->dentry = NULL; 1419 cs->dentry = NULL;
1326 spin_unlock(&d->d_lock); 1420 spin_unlock(&d->d_lock);
1327 cpuset_d_remove_dir(d); 1421 cpuset_d_remove_dir(d);
1328 dput(d); 1422 dput(d);
1329 up(&cpuset_sem); 1423 up(&cpuset_sem);
1424 cpuset_release_agent(pathbuf);
1330 return 0; 1425 return 0;
1331} 1426}
1332 1427
@@ -1383,10 +1478,10 @@ void __init cpuset_init_smp(void)
1383 1478
1384/** 1479/**
1385 * cpuset_fork - attach newly forked task to its parents cpuset. 1480 * cpuset_fork - attach newly forked task to its parents cpuset.
1386 * @p: pointer to task_struct of forking parent process. 1481 * @tsk: pointer to task_struct of forking parent process.
1387 * 1482 *
1388 * Description: By default, on fork, a task inherits its 1483 * Description: By default, on fork, a task inherits its
1389 * parents cpuset. The pointer to the shared cpuset is 1484 * parent's cpuset. The pointer to the shared cpuset is
1390 * automatically copied in fork.c by dup_task_struct(). 1485 * automatically copied in fork.c by dup_task_struct().
1391 * This cpuset_fork() routine need only increment the usage 1486 * This cpuset_fork() routine need only increment the usage
1392 * counter in that cpuset. 1487 * counter in that cpuset.
@@ -1414,7 +1509,6 @@ void cpuset_fork(struct task_struct *tsk)
1414 * by the cpuset_sem semaphore. If you don't hold cpuset_sem, 1509 * by the cpuset_sem semaphore. If you don't hold cpuset_sem,
1415 * then a zero cpuset use count is a license to any other task to 1510 * then a zero cpuset use count is a license to any other task to
1416 * nuke the cpuset immediately. 1511 * nuke the cpuset immediately.
1417 *
1418 **/ 1512 **/
1419 1513
1420void cpuset_exit(struct task_struct *tsk) 1514void cpuset_exit(struct task_struct *tsk)
@@ -1427,10 +1521,13 @@ void cpuset_exit(struct task_struct *tsk)
1427 task_unlock(tsk); 1521 task_unlock(tsk);
1428 1522
1429 if (notify_on_release(cs)) { 1523 if (notify_on_release(cs)) {
1524 char *pathbuf = NULL;
1525
1430 down(&cpuset_sem); 1526 down(&cpuset_sem);
1431 if (atomic_dec_and_test(&cs->count)) 1527 if (atomic_dec_and_test(&cs->count))
1432 check_for_release(cs); 1528 check_for_release(cs, &pathbuf);
1433 up(&cpuset_sem); 1529 up(&cpuset_sem);
1530 cpuset_release_agent(pathbuf);
1434 } else { 1531 } else {
1435 atomic_dec(&cs->count); 1532 atomic_dec(&cs->count);
1436 } 1533 }
@@ -1464,7 +1561,9 @@ void cpuset_init_current_mems_allowed(void)
1464 current->mems_allowed = NODE_MASK_ALL; 1561 current->mems_allowed = NODE_MASK_ALL;
1465} 1562}
1466 1563
1467/* 1564/**
1565 * cpuset_update_current_mems_allowed - update mems parameters to new values
1566 *
1468 * If the current tasks cpusets mems_allowed changed behind our backs, 1567 * If the current tasks cpusets mems_allowed changed behind our backs,
1469 * update current->mems_allowed and mems_generation to the new value. 1568 * update current->mems_allowed and mems_generation to the new value.
1470 * Do not call this routine if in_interrupt(). 1569 * Do not call this routine if in_interrupt().
@@ -1483,13 +1582,20 @@ void cpuset_update_current_mems_allowed(void)
1483 } 1582 }
1484} 1583}
1485 1584
1585/**
1586 * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
1587 * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
1588 */
1486void cpuset_restrict_to_mems_allowed(unsigned long *nodes) 1589void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
1487{ 1590{
1488 bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), 1591 bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
1489 MAX_NUMNODES); 1592 MAX_NUMNODES);
1490} 1593}
1491 1594
1492/* 1595/**
1596 * cpuset_zonelist_valid_mems_allowed - check zonelist vs. current mems_allowed
1597 * @zl: the zonelist to be checked
1598 *
1493 * Are any of the nodes on zonelist zl allowed in current->mems_allowed? 1599 * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
1494 */ 1600 */
1495int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) 1601int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
@@ -1505,8 +1611,12 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1505 return 0; 1611 return 0;
1506} 1612}
1507 1613
1508/* 1614/**
1509 * Is 'current' valid, and is zone z allowed in current->mems_allowed? 1615 * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
1616 * @z: zone in question
1617 *
1618 * Is zone z allowed in current->mems_allowed, or is
1619 * the CPU in interrupt context? (zone is always allowed in this case)
1510 */ 1620 */
1511int cpuset_zone_allowed(struct zone *z) 1621int cpuset_zone_allowed(struct zone *z)
1512{ 1622{
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..334c37f5218a
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,61 @@
1/*
2 * kernel/crash_dump.c - Memory preserving reboot related code.
3 *
4 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */
7
8#include <linux/smp_lock.h>
9#include <linux/errno.h>
10#include <linux/proc_fs.h>
11#include <linux/bootmem.h>
12#include <linux/highmem.h>
13#include <linux/crash_dump.h>
14
15#include <asm/io.h>
16#include <asm/uaccess.h>
17
18/* Stores the physical address of elf header of crash image. */
19unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
20
21/**
22 * copy_oldmem_page - copy one page from "oldmem"
23 * @pfn: page frame number to be copied
24 * @buf: target memory address for the copy; this can be in kernel address
25 * space or user address space (see @userbuf)
26 * @csize: number of bytes to copy
27 * @offset: offset in bytes into the page (based on pfn) to begin the copy
28 * @userbuf: if set, @buf is in user address space, use copy_to_user(),
29 * otherwise @buf is in kernel address space, use memcpy().
30 *
31 * Copy a page from "oldmem". For this page, there is no pte mapped
32 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
33 */
34ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
35 size_t csize, unsigned long offset, int userbuf)
36{
37 void *page, *vaddr;
38
39 if (!csize)
40 return 0;
41
42 page = kmalloc(PAGE_SIZE, GFP_KERNEL);
43 if (!page)
44 return -ENOMEM;
45
46 vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
47 copy_page(page, vaddr);
48 kunmap_atomic(vaddr, KM_PTE0);
49
50 if (userbuf) {
51 if (copy_to_user(buf, (page + offset), csize)) {
52 kfree(page);
53 return -EFAULT;
54 }
55 } else {
56 memcpy(buf, (page + offset), csize);
57 }
58
59 kfree(page);
60 return csize;
61}
diff --git a/kernel/exit.c b/kernel/exit.c
index 2ef2ad540201..5b0fb9f09f21 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,11 @@ repeat:
72 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 72 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
73 __exit_signal(p); 73 __exit_signal(p);
74 __exit_sighand(p); 74 __exit_sighand(p);
75 /*
76 * Note that the fastpath in sys_times depends on __exit_signal having
77 * updated the counters before a task is removed from the tasklist of
78 * the process by __unhash_process.
79 */
75 __unhash_process(p); 80 __unhash_process(p);
76 81
77 /* 82 /*
@@ -779,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code)
779 784
780 profile_task_exit(tsk); 785 profile_task_exit(tsk);
781 786
787 WARN_ON(atomic_read(&tsk->fs_excl));
788
782 if (unlikely(in_interrupt())) 789 if (unlikely(in_interrupt()))
783 panic("Aiee, killing interrupt handler!"); 790 panic("Aiee, killing interrupt handler!");
784 if (unlikely(!tsk->pid)) 791 if (unlikely(!tsk->pid))
@@ -793,6 +800,17 @@ fastcall NORET_TYPE void do_exit(long code)
793 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); 800 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
794 } 801 }
795 802
803 /*
804 * We're taking recursive faults here in do_exit. Safest is to just
805 * leave this task alone and wait for reboot.
806 */
807 if (unlikely(tsk->flags & PF_EXITING)) {
808 printk(KERN_ALERT
809 "Fixing recursive fault but reboot is needed!\n");
810 set_current_state(TASK_UNINTERRUPTIBLE);
811 schedule();
812 }
813
796 tsk->flags |= PF_EXITING; 814 tsk->flags |= PF_EXITING;
797 815
798 /* 816 /*
@@ -811,8 +829,10 @@ fastcall NORET_TYPE void do_exit(long code)
811 acct_update_integrals(tsk); 829 acct_update_integrals(tsk);
812 update_mem_hiwater(tsk); 830 update_mem_hiwater(tsk);
813 group_dead = atomic_dec_and_test(&tsk->signal->live); 831 group_dead = atomic_dec_and_test(&tsk->signal->live);
814 if (group_dead) 832 if (group_dead) {
833 del_timer_sync(&tsk->signal->real_timer);
815 acct_process(code); 834 acct_process(code);
835 }
816 exit_mm(tsk); 836 exit_mm(tsk);
817 837
818 exit_sem(tsk); 838 exit_sem(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index a28d11e10877..b65187f0c74e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -208,8 +208,10 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
208 struct file *file; 208 struct file *file;
209 209
210 if (mpnt->vm_flags & VM_DONTCOPY) { 210 if (mpnt->vm_flags & VM_DONTCOPY) {
211 long pages = vma_pages(mpnt);
212 mm->total_vm -= pages;
211 __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 213 __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
212 -vma_pages(mpnt)); 214 -pages);
213 continue; 215 continue;
214 } 216 }
215 charge = 0; 217 charge = 0;
@@ -1003,9 +1005,6 @@ static task_t *copy_process(unsigned long clone_flags,
1003 p->pdeath_signal = 0; 1005 p->pdeath_signal = 0;
1004 p->exit_state = 0; 1006 p->exit_state = 0;
1005 1007
1006 /* Perform scheduler related setup */
1007 sched_fork(p);
1008
1009 /* 1008 /*
1010 * Ok, make it visible to the rest of the system. 1009 * Ok, make it visible to the rest of the system.
1011 * We dont wake it up yet. 1010 * We dont wake it up yet.
@@ -1014,18 +1013,24 @@ static task_t *copy_process(unsigned long clone_flags,
1014 INIT_LIST_HEAD(&p->ptrace_children); 1013 INIT_LIST_HEAD(&p->ptrace_children);
1015 INIT_LIST_HEAD(&p->ptrace_list); 1014 INIT_LIST_HEAD(&p->ptrace_list);
1016 1015
1016 /* Perform scheduler related setup. Assign this task to a CPU. */
1017 sched_fork(p, clone_flags);
1018
1017 /* Need tasklist lock for parent etc handling! */ 1019 /* Need tasklist lock for parent etc handling! */
1018 write_lock_irq(&tasklist_lock); 1020 write_lock_irq(&tasklist_lock);
1019 1021
1020 /* 1022 /*
1021 * The task hasn't been attached yet, so cpus_allowed mask cannot 1023 * The task hasn't been attached yet, so its cpus_allowed mask will
1022 * have changed. The cpus_allowed mask of the parent may have 1024 * not be changed, nor will its assigned CPU.
1023 * changed after it was copied first time, and it may then move to 1025 *
1024 * another CPU - so we re-copy it here and set the child's CPU to 1026 * The cpus_allowed mask of the parent may have changed after it was
1025 * the parent's CPU. This avoids alot of nasty races. 1027 * copied first time - so re-copy it here, then check the child's CPU
1028 * to ensure it is on a valid CPU (and if not, just force it back to
1029 * parent's CPU). This avoids alot of nasty races.
1026 */ 1030 */
1027 p->cpus_allowed = current->cpus_allowed; 1031 p->cpus_allowed = current->cpus_allowed;
1028 set_task_cpu(p, smp_processor_id()); 1032 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
1033 set_task_cpu(p, smp_processor_id());
1029 1034
1030 /* 1035 /*
1031 * Check for pending SIGKILL! The new thread should not be allowed 1036 * Check for pending SIGKILL! The new thread should not be allowed
@@ -1087,6 +1092,11 @@ static task_t *copy_process(unsigned long clone_flags,
1087 spin_unlock(&current->sighand->siglock); 1092 spin_unlock(&current->sighand->siglock);
1088 } 1093 }
1089 1094
1095 /*
1096 * inherit ioprio
1097 */
1098 p->ioprio = current->ioprio;
1099
1090 SET_LINKS(p); 1100 SET_LINKS(p);
1091 if (unlikely(p->ptrace & PT_PTRACED)) 1101 if (unlikely(p->ptrace & PT_PTRACED))
1092 __ptrace_link(p, current->parent); 1102 __ptrace_link(p, current->parent);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 98d62d8efeaf..3467097ca61a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h>
12 13
13/* 14/*
14 * Autodetection depends on the fact that any interrupt that 15 * Autodetection depends on the fact that any interrupt that
@@ -26,7 +27,7 @@ static DECLARE_MUTEX(probe_sem);
26 */ 27 */
27unsigned long probe_irq_on(void) 28unsigned long probe_irq_on(void)
28{ 29{
29 unsigned long val, delay; 30 unsigned long val;
30 irq_desc_t *desc; 31 irq_desc_t *desc;
31 unsigned int i; 32 unsigned int i;
32 33
@@ -45,8 +46,7 @@ unsigned long probe_irq_on(void)
45 } 46 }
46 47
47 /* Wait for longstanding interrupts to trigger. */ 48 /* Wait for longstanding interrupts to trigger. */
48 for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) 49 msleep(20);
49 /* about 20ms delay */ barrier();
50 50
51 /* 51 /*
52 * enable any unassigned irqs 52 * enable any unassigned irqs
@@ -68,8 +68,7 @@ unsigned long probe_irq_on(void)
68 /* 68 /*
69 * Wait for spurious interrupts to trigger 69 * Wait for spurious interrupts to trigger
70 */ 70 */
71 for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) 71 msleep(100);
72 /* about 100ms delay */ barrier();
73 72
74 /* 73 /*
75 * Now filter out any obviously spurious interrupts 74 * Now filter out any obviously spurious interrupts
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 436c7d93c00a..c29f83c16497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -172,7 +172,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
172 172
173 spin_lock(&desc->lock); 173 spin_lock(&desc->lock);
174 if (!noirqdebug) 174 if (!noirqdebug)
175 note_interrupt(irq, desc, action_ret); 175 note_interrupt(irq, desc, action_ret, regs);
176 if (likely(!(desc->status & IRQ_PENDING))) 176 if (likely(!(desc->status & IRQ_PENDING)))
177 break; 177 break;
178 desc->status &= ~IRQ_PENDING; 178 desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f6297c306905..7df9abd5ec86 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,6 +11,83 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13 13
14static int irqfixup;
15
16/*
17 * Recovery handler for misrouted interrupts.
18 */
19
20static int misrouted_irq(int irq, struct pt_regs *regs)
21{
22 int i;
23 irq_desc_t *desc;
24 int ok = 0;
25 int work = 0; /* Did we do work for a real IRQ */
26
27 for(i = 1; i < NR_IRQS; i++) {
28 struct irqaction *action;
29
30 if (i == irq) /* Already tried */
31 continue;
32 desc = &irq_desc[i];
33 spin_lock(&desc->lock);
34 action = desc->action;
35 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) {
37 /*
38 * Already running: If it is shared get the other
39 * CPU to go looking for our mystery interrupt too
40 */
41 if (desc->action && (desc->action->flags & SA_SHIRQ))
42 desc->status |= IRQ_PENDING;
43 spin_unlock(&desc->lock);
44 continue;
45 }
46 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS;
48 spin_unlock(&desc->lock);
49 while (action) {
50 /* Only shared IRQ handlers are safe to call */
51 if (action->flags & SA_SHIRQ) {
52 if (action->handler(i, action->dev_id, regs) ==
53 IRQ_HANDLED)
54 ok = 1;
55 }
56 action = action->next;
57 }
58 local_irq_disable();
59 /* Now clean up the flags */
60 spin_lock(&desc->lock);
61 action = desc->action;
62
63 /*
64 * While we were looking for a fixup someone queued a real
65 * IRQ clashing with our walk
66 */
67
68 while ((desc->status & IRQ_PENDING) && action) {
69 /*
70 * Perform real IRQ processing for the IRQ we deferred
71 */
72 work = 1;
73 spin_unlock(&desc->lock);
74 handle_IRQ_event(i, regs, action);
75 spin_lock(&desc->lock);
76 desc->status &= ~IRQ_PENDING;
77 }
78 desc->status &= ~IRQ_INPROGRESS;
79 /*
80 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too
82 */
83 if(work)
84 desc->handler->end(i);
85 spin_unlock(&desc->lock);
86 }
87 /* So the caller can adjust the irq error counts */
88 return ok;
89}
90
14/* 91/*
15 * If 99,900 of the previous 100,000 interrupts have not been handled 92 * If 99,900 of the previous 100,000 interrupts have not been handled
16 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 93 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -31,7 +108,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
31 printk(KERN_ERR "irq event %d: bogus return value %x\n", 108 printk(KERN_ERR "irq event %d: bogus return value %x\n",
32 irq, action_ret); 109 irq, action_ret);
33 } else { 110 } else {
34 printk(KERN_ERR "irq %d: nobody cared!\n", irq); 111 printk(KERN_ERR "irq %d: nobody cared (try booting with "
112 "the \"irqpoll\" option)\n", irq);
35 } 113 }
36 dump_stack(); 114 dump_stack();
37 printk(KERN_ERR "handlers:\n"); 115 printk(KERN_ERR "handlers:\n");
@@ -45,7 +123,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
45 } 123 }
46} 124}
47 125
48void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 126static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
49{ 127{
50 static int count = 100; 128 static int count = 100;
51 129
@@ -55,7 +133,8 @@ void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
55 } 133 }
56} 134}
57 135
58void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
137 struct pt_regs *regs)
59{ 138{
60 if (action_ret != IRQ_HANDLED) { 139 if (action_ret != IRQ_HANDLED) {
61 desc->irqs_unhandled++; 140 desc->irqs_unhandled++;
@@ -63,6 +142,15 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
63 report_bad_irq(irq, desc, action_ret); 142 report_bad_irq(irq, desc, action_ret);
64 } 143 }
65 144
145 if (unlikely(irqfixup)) {
146 /* Don't punish working computers */
147 if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
148 int ok = misrouted_irq(irq, regs);
149 if (action_ret == IRQ_NONE)
150 desc->irqs_unhandled -= ok;
151 }
152 }
153
66 desc->irq_count++; 154 desc->irq_count++;
67 if (desc->irq_count < 100000) 155 if (desc->irq_count < 100000)
68 return; 156 return;
@@ -94,3 +182,24 @@ int __init noirqdebug_setup(char *str)
94 182
95__setup("noirqdebug", noirqdebug_setup); 183__setup("noirqdebug", noirqdebug_setup);
96 184
185static int __init irqfixup_setup(char *str)
186{
187 irqfixup = 1;
188 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
189 printk(KERN_WARNING "This may impact system performance.\n");
190 return 1;
191}
192
193__setup("irqfixup", irqfixup_setup);
194
195static int __init irqpoll_setup(char *str)
196{
197 irqfixup = 2;
198 printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
199 "enabled\n");
200 printk(KERN_WARNING "This may significantly impact system "
201 "performance\n");
202 return 1;
203}
204
205__setup("irqpoll", irqpoll_setup);
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 1dc988e0d2c7..7c1b25e25e47 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -112,28 +112,11 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
112 return error; 112 return error;
113} 113}
114 114
115/*
116 * Called with P->sighand->siglock held and P->signal->real_timer inactive.
117 * If interval is nonzero, arm the timer for interval ticks from now.
118 */
119static inline void it_real_arm(struct task_struct *p, unsigned long interval)
120{
121 p->signal->it_real_value = interval; /* XXX unnecessary field?? */
122 if (interval == 0)
123 return;
124 if (interval > (unsigned long) LONG_MAX)
125 interval = LONG_MAX;
126 /* the "+ 1" below makes sure that the timer doesn't go off before
127 * the interval requested. This could happen if
128 * time requested % (usecs per jiffy) is more than the usecs left
129 * in the current jiffy */
130 p->signal->real_timer.expires = jiffies + interval + 1;
131 add_timer(&p->signal->real_timer);
132}
133 115
134void it_real_fn(unsigned long __data) 116void it_real_fn(unsigned long __data)
135{ 117{
136 struct task_struct * p = (struct task_struct *) __data; 118 struct task_struct * p = (struct task_struct *) __data;
119 unsigned long inc = p->signal->it_real_incr;
137 120
138 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); 121 send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
139 122
@@ -141,26 +124,42 @@ void it_real_fn(unsigned long __data)
141 * Now restart the timer if necessary. We don't need any locking 124 * Now restart the timer if necessary. We don't need any locking
142 * here because do_setitimer makes sure we have finished running 125 * here because do_setitimer makes sure we have finished running
143 * before it touches anything. 126 * before it touches anything.
127 * Note, we KNOW we are (or should be) at a jiffie edge here so
128 * we don't need the +1 stuff. Also, we want to use the prior
129 * expire value so as to not "slip" a jiffie if we are late.
130 * Deal with requesting a time prior to "now" here rather than
131 * in add_timer.
144 */ 132 */
145 it_real_arm(p, p->signal->it_real_incr); 133 if (!inc)
134 return;
135 while (time_before_eq(p->signal->real_timer.expires, jiffies))
136 p->signal->real_timer.expires += inc;
137 add_timer(&p->signal->real_timer);
146} 138}
147 139
148int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) 140int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
149{ 141{
150 struct task_struct *tsk = current; 142 struct task_struct *tsk = current;
151 unsigned long val, interval; 143 unsigned long val, interval, expires;
152 cputime_t cval, cinterval, nval, ninterval; 144 cputime_t cval, cinterval, nval, ninterval;
153 145
154 switch (which) { 146 switch (which) {
155 case ITIMER_REAL: 147 case ITIMER_REAL:
148again:
156 spin_lock_irq(&tsk->sighand->siglock); 149 spin_lock_irq(&tsk->sighand->siglock);
157 interval = tsk->signal->it_real_incr; 150 interval = tsk->signal->it_real_incr;
158 val = it_real_value(tsk->signal); 151 val = it_real_value(tsk->signal);
159 if (val) 152 /* We are sharing ->siglock with it_real_fn() */
160 del_timer_sync(&tsk->signal->real_timer); 153 if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) {
154 spin_unlock_irq(&tsk->sighand->siglock);
155 goto again;
156 }
161 tsk->signal->it_real_incr = 157 tsk->signal->it_real_incr =
162 timeval_to_jiffies(&value->it_interval); 158 timeval_to_jiffies(&value->it_interval);
163 it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); 159 expires = timeval_to_jiffies(&value->it_value);
160 if (expires)
161 mod_timer(&tsk->signal->real_timer,
162 jiffies + 1 + expires);
164 spin_unlock_irq(&tsk->sighand->siglock); 163 spin_unlock_irq(&tsk->sighand->siglock);
165 if (ovalue) { 164 if (ovalue) {
166 jiffies_to_timeval(val, &ovalue->it_value); 165 jiffies_to_timeval(val, &ovalue->it_value);
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..cdd4dcd8fb63
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1063 @@
1/*
2 * kexec.c - kexec system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#include <linux/mm.h>
10#include <linux/file.h>
11#include <linux/slab.h>
12#include <linux/fs.h>
13#include <linux/kexec.h>
14#include <linux/spinlock.h>
15#include <linux/list.h>
16#include <linux/highmem.h>
17#include <linux/syscalls.h>
18#include <linux/reboot.h>
19#include <linux/syscalls.h>
20#include <linux/ioport.h>
21#include <linux/hardirq.h>
22
23#include <asm/page.h>
24#include <asm/uaccess.h>
25#include <asm/io.h>
26#include <asm/system.h>
27#include <asm/semaphore.h>
28
29/* Location of the reserved area for the crash kernel */
30struct resource crashk_res = {
31 .name = "Crash kernel",
32 .start = 0,
33 .end = 0,
34 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
35};
36
37int kexec_should_crash(struct task_struct *p)
38{
39 if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
40 return 1;
41 return 0;
42}
43
44/*
45 * When kexec transitions to the new kernel there is a one-to-one
46 * mapping between physical and virtual addresses. On processors
47 * where you can disable the MMU this is trivial, and easy. For
48 * others it is still a simple predictable page table to setup.
49 *
50 * In that environment kexec copies the new kernel to its final
51 * resting place. This means I can only support memory whose
52 * physical address can fit in an unsigned long. In particular
53 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
54 * If the assembly stub has more restrictive requirements
55 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
56 * defined more restrictively in <asm/kexec.h>.
57 *
58 * The code for the transition from the current kernel to the
59 * the new kernel is placed in the control_code_buffer, whose size
60 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
61 * page of memory is necessary, but some architectures require more.
62 * Because this memory must be identity mapped in the transition from
63 * virtual to physical addresses it must live in the range
64 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
65 * modifiable.
66 *
67 * The assembly stub in the control code buffer is passed a linked list
68 * of descriptor pages detailing the source pages of the new kernel,
69 * and the destination addresses of those source pages. As this data
70 * structure is not used in the context of the current OS, it must
71 * be self-contained.
72 *
73 * The code has been made to work with highmem pages and will use a
74 * destination page in its final resting place (if it happens
75 * to allocate it). The end product of this is that most of the
76 * physical address space, and most of RAM can be used.
77 *
78 * Future directions include:
79 * - allocating a page table with the control code buffer identity
80 * mapped, to simplify machine_kexec and make kexec_on_panic more
81 * reliable.
82 */
83
84/*
85 * KIMAGE_NO_DEST is an impossible destination address..., for
86 * allocating pages whose destination address we do not care about.
87 */
88#define KIMAGE_NO_DEST (-1UL)
89
90static int kimage_is_destination_range(struct kimage *image,
91 unsigned long start, unsigned long end);
92static struct page *kimage_alloc_page(struct kimage *image,
93 unsigned int gfp_mask,
94 unsigned long dest);
95
96static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
97 unsigned long nr_segments,
98 struct kexec_segment __user *segments)
99{
100 size_t segment_bytes;
101 struct kimage *image;
102 unsigned long i;
103 int result;
104
105 /* Allocate a controlling structure */
106 result = -ENOMEM;
107 image = kmalloc(sizeof(*image), GFP_KERNEL);
108 if (!image)
109 goto out;
110
111 memset(image, 0, sizeof(*image));
112 image->head = 0;
113 image->entry = &image->head;
114 image->last_entry = &image->head;
115 image->control_page = ~0; /* By default this does not apply */
116 image->start = entry;
117 image->type = KEXEC_TYPE_DEFAULT;
118
119 /* Initialize the list of control pages */
120 INIT_LIST_HEAD(&image->control_pages);
121
122 /* Initialize the list of destination pages */
123 INIT_LIST_HEAD(&image->dest_pages);
124
125 /* Initialize the list of unuseable pages */
126 INIT_LIST_HEAD(&image->unuseable_pages);
127
128 /* Read in the segments */
129 image->nr_segments = nr_segments;
130 segment_bytes = nr_segments * sizeof(*segments);
131 result = copy_from_user(image->segment, segments, segment_bytes);
132 if (result)
133 goto out;
134
135 /*
136 * Verify we have good destination addresses. The caller is
137 * responsible for making certain we don't attempt to load
138 * the new image into invalid or reserved areas of RAM. This
139 * just verifies it is an address we can use.
140 *
141 * Since the kernel does everything in page size chunks ensure
142 * the destination addreses are page aligned. Too many
143 * special cases crop of when we don't do this. The most
144 * insidious is getting overlapping destination addresses
145 * simply because addresses are changed to page size
146 * granularity.
147 */
148 result = -EADDRNOTAVAIL;
149 for (i = 0; i < nr_segments; i++) {
150 unsigned long mstart, mend;
151
152 mstart = image->segment[i].mem;
153 mend = mstart + image->segment[i].memsz;
154 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
155 goto out;
156 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
157 goto out;
158 }
159
160 /* Verify our destination addresses do not overlap.
161 * If we alloed overlapping destination addresses
162 * through very weird things can happen with no
163 * easy explanation as one segment stops on another.
164 */
165 result = -EINVAL;
166 for (i = 0; i < nr_segments; i++) {
167 unsigned long mstart, mend;
168 unsigned long j;
169
170 mstart = image->segment[i].mem;
171 mend = mstart + image->segment[i].memsz;
172 for (j = 0; j < i; j++) {
173 unsigned long pstart, pend;
174 pstart = image->segment[j].mem;
175 pend = pstart + image->segment[j].memsz;
176 /* Do the segments overlap ? */
177 if ((mend > pstart) && (mstart < pend))
178 goto out;
179 }
180 }
181
182 /* Ensure our buffer sizes are strictly less than
183 * our memory sizes. This should always be the case,
184 * and it is easier to check up front than to be surprised
185 * later on.
186 */
187 result = -EINVAL;
188 for (i = 0; i < nr_segments; i++) {
189 if (image->segment[i].bufsz > image->segment[i].memsz)
190 goto out;
191 }
192
193 result = 0;
194out:
195 if (result == 0)
196 *rimage = image;
197 else
198 kfree(image);
199
200 return result;
201
202}
203
204static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
205 unsigned long nr_segments,
206 struct kexec_segment __user *segments)
207{
208 int result;
209 struct kimage *image;
210
211 /* Allocate and initialize a controlling structure */
212 image = NULL;
213 result = do_kimage_alloc(&image, entry, nr_segments, segments);
214 if (result)
215 goto out;
216
217 *rimage = image;
218
219 /*
220 * Find a location for the control code buffer, and add it
221 * the vector of segments so that it's pages will also be
222 * counted as destination pages.
223 */
224 result = -ENOMEM;
225 image->control_code_page = kimage_alloc_control_pages(image,
226 get_order(KEXEC_CONTROL_CODE_SIZE));
227 if (!image->control_code_page) {
228 printk(KERN_ERR "Could not allocate control_code_buffer\n");
229 goto out;
230 }
231
232 result = 0;
233 out:
234 if (result == 0)
235 *rimage = image;
236 else
237 kfree(image);
238
239 return result;
240}
241
242static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
243 unsigned long nr_segments,
244 struct kexec_segment __user *segments)
245{
246 int result;
247 struct kimage *image;
248 unsigned long i;
249
250 image = NULL;
251 /* Verify we have a valid entry point */
252 if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
253 result = -EADDRNOTAVAIL;
254 goto out;
255 }
256
257 /* Allocate and initialize a controlling structure */
258 result = do_kimage_alloc(&image, entry, nr_segments, segments);
259 if (result)
260 goto out;
261
262 /* Enable the special crash kernel control page
263 * allocation policy.
264 */
265 image->control_page = crashk_res.start;
266 image->type = KEXEC_TYPE_CRASH;
267
268 /*
269 * Verify we have good destination addresses. Normally
270 * the caller is responsible for making certain we don't
271 * attempt to load the new image into invalid or reserved
272 * areas of RAM. But crash kernels are preloaded into a
273 * reserved area of ram. We must ensure the addresses
274 * are in the reserved area otherwise preloading the
275 * kernel could corrupt things.
276 */
277 result = -EADDRNOTAVAIL;
278 for (i = 0; i < nr_segments; i++) {
279 unsigned long mstart, mend;
280
281 mstart = image->segment[i].mem;
282 mend = mstart + image->segment[i].memsz - 1;
283 /* Ensure we are within the crash kernel limits */
284 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
285 goto out;
286 }
287
288 /*
289 * Find a location for the control code buffer, and add
290 * the vector of segments so that it's pages will also be
291 * counted as destination pages.
292 */
293 result = -ENOMEM;
294 image->control_code_page = kimage_alloc_control_pages(image,
295 get_order(KEXEC_CONTROL_CODE_SIZE));
296 if (!image->control_code_page) {
297 printk(KERN_ERR "Could not allocate control_code_buffer\n");
298 goto out;
299 }
300
301 result = 0;
302out:
303 if (result == 0)
304 *rimage = image;
305 else
306 kfree(image);
307
308 return result;
309}
310
311static int kimage_is_destination_range(struct kimage *image,
312 unsigned long start,
313 unsigned long end)
314{
315 unsigned long i;
316
317 for (i = 0; i < image->nr_segments; i++) {
318 unsigned long mstart, mend;
319
320 mstart = image->segment[i].mem;
321 mend = mstart + image->segment[i].memsz;
322 if ((end > mstart) && (start < mend))
323 return 1;
324 }
325
326 return 0;
327}
328
329static struct page *kimage_alloc_pages(unsigned int gfp_mask,
330 unsigned int order)
331{
332 struct page *pages;
333
334 pages = alloc_pages(gfp_mask, order);
335 if (pages) {
336 unsigned int count, i;
337 pages->mapping = NULL;
338 pages->private = order;
339 count = 1 << order;
340 for (i = 0; i < count; i++)
341 SetPageReserved(pages + i);
342 }
343
344 return pages;
345}
346
347static void kimage_free_pages(struct page *page)
348{
349 unsigned int order, count, i;
350
351 order = page->private;
352 count = 1 << order;
353 for (i = 0; i < count; i++)
354 ClearPageReserved(page + i);
355 __free_pages(page, order);
356}
357
358static void kimage_free_page_list(struct list_head *list)
359{
360 struct list_head *pos, *next;
361
362 list_for_each_safe(pos, next, list) {
363 struct page *page;
364
365 page = list_entry(pos, struct page, lru);
366 list_del(&page->lru);
367 kimage_free_pages(page);
368 }
369}
370
371static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
372 unsigned int order)
373{
374 /* Control pages are special, they are the intermediaries
375 * that are needed while we copy the rest of the pages
376 * to their final resting place. As such they must
377 * not conflict with either the destination addresses
378 * or memory the kernel is already using.
379 *
380 * The only case where we really need more than one of
381 * these are for architectures where we cannot disable
382 * the MMU and must instead generate an identity mapped
383 * page table for all of the memory.
384 *
385 * At worst this runs in O(N) of the image size.
386 */
387 struct list_head extra_pages;
388 struct page *pages;
389 unsigned int count;
390
391 count = 1 << order;
392 INIT_LIST_HEAD(&extra_pages);
393
394 /* Loop while I can allocate a page and the page allocated
395 * is a destination page.
396 */
397 do {
398 unsigned long pfn, epfn, addr, eaddr;
399
400 pages = kimage_alloc_pages(GFP_KERNEL, order);
401 if (!pages)
402 break;
403 pfn = page_to_pfn(pages);
404 epfn = pfn + count;
405 addr = pfn << PAGE_SHIFT;
406 eaddr = epfn << PAGE_SHIFT;
407 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
408 kimage_is_destination_range(image, addr, eaddr)) {
409 list_add(&pages->lru, &extra_pages);
410 pages = NULL;
411 }
412 } while (!pages);
413
414 if (pages) {
415 /* Remember the allocated page... */
416 list_add(&pages->lru, &image->control_pages);
417
418 /* Because the page is already in it's destination
419 * location we will never allocate another page at
420 * that address. Therefore kimage_alloc_pages
421 * will not return it (again) and we don't need
422 * to give it an entry in image->segment[].
423 */
424 }
425 /* Deal with the destination pages I have inadvertently allocated.
426 *
427 * Ideally I would convert multi-page allocations into single
428 * page allocations, and add everyting to image->dest_pages.
429 *
430 * For now it is simpler to just free the pages.
431 */
432 kimage_free_page_list(&extra_pages);
433
434 return pages;
435}
436
437static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
438 unsigned int order)
439{
440 /* Control pages are special, they are the intermediaries
441 * that are needed while we copy the rest of the pages
442 * to their final resting place. As such they must
443 * not conflict with either the destination addresses
444 * or memory the kernel is already using.
445 *
446 * Control pages are also the only pags we must allocate
447 * when loading a crash kernel. All of the other pages
448 * are specified by the segments and we just memcpy
449 * into them directly.
450 *
451 * The only case where we really need more than one of
452 * these are for architectures where we cannot disable
453 * the MMU and must instead generate an identity mapped
454 * page table for all of the memory.
455 *
456 * Given the low demand this implements a very simple
457 * allocator that finds the first hole of the appropriate
458 * size in the reserved memory region, and allocates all
459 * of the memory up to and including the hole.
460 */
461 unsigned long hole_start, hole_end, size;
462 struct page *pages;
463
464 pages = NULL;
465 size = (1 << order) << PAGE_SHIFT;
466 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
467 hole_end = hole_start + size - 1;
468 while (hole_end <= crashk_res.end) {
469 unsigned long i;
470
471 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
472 break;
473 if (hole_end > crashk_res.end)
474 break;
475 /* See if I overlap any of the segments */
476 for (i = 0; i < image->nr_segments; i++) {
477 unsigned long mstart, mend;
478
479 mstart = image->segment[i].mem;
480 mend = mstart + image->segment[i].memsz - 1;
481 if ((hole_end >= mstart) && (hole_start <= mend)) {
482 /* Advance the hole to the end of the segment */
483 hole_start = (mend + (size - 1)) & ~(size - 1);
484 hole_end = hole_start + size - 1;
485 break;
486 }
487 }
488 /* If I don't overlap any segments I have found my hole! */
489 if (i == image->nr_segments) {
490 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
491 break;
492 }
493 }
494 if (pages)
495 image->control_page = hole_end;
496
497 return pages;
498}
499
500
501struct page *kimage_alloc_control_pages(struct kimage *image,
502 unsigned int order)
503{
504 struct page *pages = NULL;
505
506 switch (image->type) {
507 case KEXEC_TYPE_DEFAULT:
508 pages = kimage_alloc_normal_control_pages(image, order);
509 break;
510 case KEXEC_TYPE_CRASH:
511 pages = kimage_alloc_crash_control_pages(image, order);
512 break;
513 }
514
515 return pages;
516}
517
518static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
519{
520 if (*image->entry != 0)
521 image->entry++;
522
523 if (image->entry == image->last_entry) {
524 kimage_entry_t *ind_page;
525 struct page *page;
526
527 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
528 if (!page)
529 return -ENOMEM;
530
531 ind_page = page_address(page);
532 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
533 image->entry = ind_page;
534 image->last_entry = ind_page +
535 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
536 }
537 *image->entry = entry;
538 image->entry++;
539 *image->entry = 0;
540
541 return 0;
542}
543
544static int kimage_set_destination(struct kimage *image,
545 unsigned long destination)
546{
547 int result;
548
549 destination &= PAGE_MASK;
550 result = kimage_add_entry(image, destination | IND_DESTINATION);
551 if (result == 0)
552 image->destination = destination;
553
554 return result;
555}
556
557
558static int kimage_add_page(struct kimage *image, unsigned long page)
559{
560 int result;
561
562 page &= PAGE_MASK;
563 result = kimage_add_entry(image, page | IND_SOURCE);
564 if (result == 0)
565 image->destination += PAGE_SIZE;
566
567 return result;
568}
569
570
571static void kimage_free_extra_pages(struct kimage *image)
572{
573 /* Walk through and free any extra destination pages I may have */
574 kimage_free_page_list(&image->dest_pages);
575
576 /* Walk through and free any unuseable pages I have cached */
577 kimage_free_page_list(&image->unuseable_pages);
578
579}
580static int kimage_terminate(struct kimage *image)
581{
582 if (*image->entry != 0)
583 image->entry++;
584
585 *image->entry = IND_DONE;
586
587 return 0;
588}
589
590#define for_each_kimage_entry(image, ptr, entry) \
591 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
592 ptr = (entry & IND_INDIRECTION)? \
593 phys_to_virt((entry & PAGE_MASK)): ptr +1)
594
595static void kimage_free_entry(kimage_entry_t entry)
596{
597 struct page *page;
598
599 page = pfn_to_page(entry >> PAGE_SHIFT);
600 kimage_free_pages(page);
601}
602
/*
 * kimage_free - tear down a kexec image and release every page it owns.
 * @image: image to destroy (NULL is a no-op)
 *
 * Walks the entry list freeing the source pages.  Each indirection
 * page must only be freed after the walk has moved off it — the
 * iterator is standing on that page while reading its entries — so
 * it is remembered in @ind and freed one step late.
 */
static void kimage_free(struct kimage *image)
{
	kimage_entry_t *ptr, entry;
	kimage_entry_t ind = 0;	/* indirection page whose free is deferred */

	if (!image)
		return;

	kimage_free_extra_pages(image);
	for_each_kimage_entry(image, ptr, entry) {
		if (entry & IND_INDIRECTION) {
			/* Free the previous indirection page */
			if (ind & IND_INDIRECTION)
				kimage_free_entry(ind);
			/* Save this indirection page until we are
			 * done with it.
			 */
			ind = entry;
		}
		else if (entry & IND_SOURCE)
			kimage_free_entry(entry);
	}
	/* Free the final indirection page */
	if (ind & IND_INDIRECTION)
		kimage_free_entry(ind);

	/* Handle any machine specific cleanup */
	machine_kexec_cleanup(image);

	/* Free the kexec control pages... */
	kimage_free_page_list(&image->control_pages);
	kfree(image);
}
636
637static kimage_entry_t *kimage_dst_used(struct kimage *image,
638 unsigned long page)
639{
640 kimage_entry_t *ptr, entry;
641 unsigned long destination = 0;
642
643 for_each_kimage_entry(image, ptr, entry) {
644 if (entry & IND_DESTINATION)
645 destination = entry & PAGE_MASK;
646 else if (entry & IND_SOURCE) {
647 if (page == destination)
648 return ptr;
649 destination += PAGE_SIZE;
650 }
651 }
652
653 return NULL;
654}
655
/*
 * kimage_alloc_page - allocate the source page for a given destination.
 * @image:       image being assembled
 * @gfp_mask:    allocation flags for new pages
 * @destination: physical address this page's contents will be copied to
 *               (or KIMAGE_NO_DEST for indirection pages)
 */
static struct page *kimage_alloc_page(struct kimage *image,
					unsigned int gfp_mask,
					unsigned long destination)
{
	/*
	 * Here we implement safeguards to ensure that a source page
	 * is not copied to its destination page before the data on
	 * the destination page is no longer useful.
	 *
	 * To do this we maintain the invariant that a source page is
	 * either its own destination page, or it is not a
	 * destination page at all.
	 *
	 * That is slightly stronger than required, but it makes the
	 * proof that no problems will occur trivial, and the
	 * implementation simple to verify.
	 *
	 * When allocating all pages normally this algorithm will run
	 * in O(N) time, but in the worst case it will run in O(N^2)
	 * time.   If the runtime is a problem the data structures can
	 * be fixed.
	 */
	struct page *page;
	unsigned long addr;

	/*
	 * Walk through the list of destination pages, and see if I
	 * have a match.
	 */
	list_for_each_entry(page, &image->dest_pages, lru) {
		addr = page_to_pfn(page) << PAGE_SHIFT;
		if (addr == destination) {
			list_del(&page->lru);
			return page;
		}
	}
	page = NULL;
	while (1) {
		kimage_entry_t *old;

		/* Allocate a page, if we run out of memory give up */
		page = kimage_alloc_pages(gfp_mask, 0);
		if (!page)
			return NULL;
		/* If the page cannot be used file it away */
		if (page_to_pfn(page) >
				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
			list_add(&page->lru, &image->unuseable_pages);
			continue;
		}
		addr = page_to_pfn(page) << PAGE_SHIFT;

		/* If it is the destination page we want use it */
		if (addr == destination)
			break;

		/* If the page is not a destination page use it */
		if (!kimage_is_destination_range(image, addr,
						  addr + PAGE_SIZE))
			break;

		/*
		 * I know that the page is someones destination page.
		 * See if there is already a source page for this
		 * destination page.  And if so swap the source pages.
		 */
		old = kimage_dst_used(image, addr);
		if (old) {
			/* If so move it: copy the previously loaded
			 * contents onto the fresh page and retarget the
			 * old entry at the fresh page's address.
			 */
			unsigned long old_addr;
			struct page *old_page;

			old_addr = *old & PAGE_MASK;
			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
			copy_highpage(page, old_page);
			*old = addr | (*old & ~PAGE_MASK);

			/* The old page I have found cannot be a
			 * destination page, so return it.
			 */
			addr = old_addr;
			page = old_page;
			break;
		}
		else {
			/* Place the page on the destination list I
			 * will use it later.
			 */
			list_add(&page->lru, &image->dest_pages);
		}
	}

	return page;
}
750
751static int kimage_load_normal_segment(struct kimage *image,
752 struct kexec_segment *segment)
753{
754 unsigned long maddr;
755 unsigned long ubytes, mbytes;
756 int result;
757 unsigned char __user *buf;
758
759 result = 0;
760 buf = segment->buf;
761 ubytes = segment->bufsz;
762 mbytes = segment->memsz;
763 maddr = segment->mem;
764
765 result = kimage_set_destination(image, maddr);
766 if (result < 0)
767 goto out;
768
769 while (mbytes) {
770 struct page *page;
771 char *ptr;
772 size_t uchunk, mchunk;
773
774 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
775 if (page == 0) {
776 result = -ENOMEM;
777 goto out;
778 }
779 result = kimage_add_page(image, page_to_pfn(page)
780 << PAGE_SHIFT);
781 if (result < 0)
782 goto out;
783
784 ptr = kmap(page);
785 /* Start with a clear page */
786 memset(ptr, 0, PAGE_SIZE);
787 ptr += maddr & ~PAGE_MASK;
788 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
789 if (mchunk > mbytes)
790 mchunk = mbytes;
791
792 uchunk = mchunk;
793 if (uchunk > ubytes)
794 uchunk = ubytes;
795
796 result = copy_from_user(ptr, buf, uchunk);
797 kunmap(page);
798 if (result) {
799 result = (result < 0) ? result : -EIO;
800 goto out;
801 }
802 ubytes -= uchunk;
803 maddr += mchunk;
804 buf += mchunk;
805 mbytes -= mchunk;
806 }
807out:
808 return result;
809}
810
811static int kimage_load_crash_segment(struct kimage *image,
812 struct kexec_segment *segment)
813{
814 /* For crash dumps kernels we simply copy the data from
815 * user space to it's destination.
816 * We do things a page at a time for the sake of kmap.
817 */
818 unsigned long maddr;
819 unsigned long ubytes, mbytes;
820 int result;
821 unsigned char __user *buf;
822
823 result = 0;
824 buf = segment->buf;
825 ubytes = segment->bufsz;
826 mbytes = segment->memsz;
827 maddr = segment->mem;
828 while (mbytes) {
829 struct page *page;
830 char *ptr;
831 size_t uchunk, mchunk;
832
833 page = pfn_to_page(maddr >> PAGE_SHIFT);
834 if (page == 0) {
835 result = -ENOMEM;
836 goto out;
837 }
838 ptr = kmap(page);
839 ptr += maddr & ~PAGE_MASK;
840 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
841 if (mchunk > mbytes)
842 mchunk = mbytes;
843
844 uchunk = mchunk;
845 if (uchunk > ubytes) {
846 uchunk = ubytes;
847 /* Zero the trailing part of the page */
848 memset(ptr + uchunk, 0, mchunk - uchunk);
849 }
850 result = copy_from_user(ptr, buf, uchunk);
851 kunmap(page);
852 if (result) {
853 result = (result < 0) ? result : -EIO;
854 goto out;
855 }
856 ubytes -= uchunk;
857 maddr += mchunk;
858 buf += mchunk;
859 mbytes -= mchunk;
860 }
861out:
862 return result;
863}
864
865static int kimage_load_segment(struct kimage *image,
866 struct kexec_segment *segment)
867{
868 int result = -ENOMEM;
869
870 switch (image->type) {
871 case KEXEC_TYPE_DEFAULT:
872 result = kimage_load_normal_segment(image, segment);
873 break;
874 case KEXEC_TYPE_CRASH:
875 result = kimage_load_crash_segment(image, segment);
876 break;
877 }
878
879 return result;
880}
881
882/*
883 * Exec Kernel system call: for obvious reasons only root may call it.
884 *
885 * This call breaks up into three pieces.
886 * - A generic part which loads the new kernel from the current
887 * address space, and very carefully places the data in the
888 * allocated pages.
889 *
890 * - A generic part that interacts with the kernel and tells all of
891 * the devices to shut down. Preventing on-going dmas, and placing
892 * the devices in a consistent state so a later kernel can
893 * reinitialize them.
894 *
 * - A machine specific part that includes the syscall number
 *   and then copies the image to its final destination, and
 *   jumps into the image at entry.
898 *
899 * kexec does not sync, or unmount filesystems so if you need
900 * that to happen you need to do that yourself.
901 */
/* Image installed by the last successful sys_kexec_load() (reboot path). */
struct kimage *kexec_image = NULL;
/* Image to switch to if the running kernel crashes. */
static struct kimage *kexec_crash_image = NULL;
/*
 * A home grown binary mutex.
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 * Taken and released with xchg(); crash_kexec() only trylocks it.
 */
static int kexec_lock = 0;
910
/*
 * sys_kexec_load - load (or unload) a kernel image for a later kexec.
 * @entry:       entry point of the new image
 * @nr_segments: number of segments in @segments (0 unloads the image)
 * @segments:    user array describing where each segment goes
 * @flags:       KEXEC_ON_CRASH selects the crash image slot;
 *               the architecture bits must match this kernel
 *
 * Returns 0 on success, or -EPERM/-EINVAL/-EBUSY/-ENOMEM etc.
 */
asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
				struct kexec_segment __user *segments,
				unsigned long flags)
{
	struct kimage **dest_image, *image;
	int locked;
	int result;

	/* We only trust the superuser with rebooting the system. */
	if (!capable(CAP_SYS_BOOT))
		return -EPERM;

	/*
	 * Verify we have a legal set of flags
	 * This leaves us room for future extensions.
	 */
	if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
		return -EINVAL;

	/* Verify we are on the appropriate architecture */
	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
		return -EINVAL;

	/* Put an artificial cap on the number
	 * of segments passed to kexec_load.
	 */
	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	image = NULL;
	result = 0;

	/* Because we write directly to the reserved memory
	 * region when loading crash kernels we need a mutex here to
	 * prevent multiple crash kernels from attempting to load
	 * simultaneously, and to prevent a crash kernel from loading
	 * over the top of a in use crash kernel.
	 *
	 * KISS: always take the mutex.
	 */
	locked = xchg(&kexec_lock, 1);
	if (locked)
		return -EBUSY;

	/* Pick the slot the new image goes into. */
	dest_image = &kexec_image;
	if (flags & KEXEC_ON_CRASH)
		dest_image = &kexec_crash_image;
	if (nr_segments > 0) {
		unsigned long i;

		/* Loading another kernel to reboot into */
		if ((flags & KEXEC_ON_CRASH) == 0)
			result = kimage_normal_alloc(&image, entry,
							nr_segments, segments);
		/* Loading another kernel to switch to if this one crashes */
		else if (flags & KEXEC_ON_CRASH) {
			/* Free any current crash dump kernel before
			 * we corrupt it.
			 */
			kimage_free(xchg(&kexec_crash_image, NULL));
			result = kimage_crash_alloc(&image, entry,
						     nr_segments, segments);
		}
		if (result)
			goto out;

		result = machine_kexec_prepare(image);
		if (result)
			goto out;

		for (i = 0; i < nr_segments; i++) {
			result = kimage_load_segment(image, &image->segment[i]);
			if (result)
				goto out;
		}
		result = kimage_terminate(image);
		if (result)
			goto out;
	}
	/* Install the new kernel, and Uninstall the old */
	image = xchg(dest_image, image);

out:
	/* On success @image now holds the displaced old image; on
	 * failure it holds the partially built one.  Either way it
	 * must be freed after the lock is dropped below.
	 */
	xchg(&kexec_lock, 0); /* Release the mutex */
	kimage_free(image);

	return result;
}
1000
#ifdef CONFIG_COMPAT
/*
 * compat_sys_kexec_load - 32-bit compat wrapper for sys_kexec_load().
 *
 * Converts each compat_kexec_segment into a native kexec_segment in a
 * scratch area of the user stack, then forwards to sys_kexec_load().
 */
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				unsigned long nr_segments,
				struct compat_kexec_segment __user *segments,
				unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i, result;

	/* Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	/* Stage the widened segment array in compat user space. */
	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i=0; i < nr_segments; i++) {
		result = copy_from_user(&in, &segments[i], sizeof(in));
		if (result)
			return -EFAULT;

		/* Widen the 32-bit fields to their native sizes. */
		out.buf = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem = in.mem;
		out.memsz = in.memsz;

		result = copy_to_user(&ksegments[i], &out, sizeof(out));
		if (result)
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
1039
/*
 * crash_kexec - switch into the loaded crash kernel from a panic path.
 * @regs: register state at the time of the crash
 *
 * Trylock semantics: if the lock is already held (e.g. by a
 * concurrent sys_kexec_load()) we simply return rather than risk
 * running machine_kexec() on an image being replaced.
 */
void crash_kexec(struct pt_regs *regs)
{
	struct kimage *image;
	int locked;


	/* Take the kexec_lock here to prevent sys_kexec_load
	 * running on one cpu from replacing the crash kernel
	 * we are using after a panic on a different cpu.
	 *
	 * If the crash kernel was not located in a fixed area
	 * of memory the xchg(&kexec_crash_image) would be
	 * sufficient. But since I reuse the memory...
	 */
	locked = xchg(&kexec_lock, 1);
	if (!locked) {
		/* Claim the image so no one else can free it under us. */
		image = xchg(&kexec_crash_image, NULL);
		if (image) {
			machine_crash_shutdown(regs);
			machine_kexec(image);	/* does not return on success */
		}
		xchg(&kexec_lock, 0);
	}
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index eed53d4f5230..44166e3bb8af 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,6 +120,7 @@ struct subprocess_info {
120 char *path; 120 char *path;
121 char **argv; 121 char **argv;
122 char **envp; 122 char **envp;
123 struct key *ring;
123 int wait; 124 int wait;
124 int retval; 125 int retval;
125}; 126};
@@ -130,16 +131,21 @@ struct subprocess_info {
130static int ____call_usermodehelper(void *data) 131static int ____call_usermodehelper(void *data)
131{ 132{
132 struct subprocess_info *sub_info = data; 133 struct subprocess_info *sub_info = data;
134 struct key *old_session;
133 int retval; 135 int retval;
134 136
135 /* Unblock all signals. */ 137 /* Unblock all signals and set the session keyring. */
138 key_get(sub_info->ring);
136 flush_signals(current); 139 flush_signals(current);
137 spin_lock_irq(&current->sighand->siglock); 140 spin_lock_irq(&current->sighand->siglock);
141 old_session = __install_session_keyring(current, sub_info->ring);
138 flush_signal_handlers(current, 1); 142 flush_signal_handlers(current, 1);
139 sigemptyset(&current->blocked); 143 sigemptyset(&current->blocked);
140 recalc_sigpending(); 144 recalc_sigpending();
141 spin_unlock_irq(&current->sighand->siglock); 145 spin_unlock_irq(&current->sighand->siglock);
142 146
147 key_put(old_session);
148
143 /* We can run anywhere, unlike our parent keventd(). */ 149 /* We can run anywhere, unlike our parent keventd(). */
144 set_cpus_allowed(current, CPU_MASK_ALL); 150 set_cpus_allowed(current, CPU_MASK_ALL);
145 151
@@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data)
211} 217}
212 218
213/** 219/**
214 * call_usermodehelper - start a usermode application 220 * call_usermodehelper_keys - start a usermode application
215 * @path: pathname for the application 221 * @path: pathname for the application
216 * @argv: null-terminated argument list 222 * @argv: null-terminated argument list
217 * @envp: null-terminated environment list 223 * @envp: null-terminated environment list
224 * @session_keyring: session keyring for process (NULL for an empty keyring)
218 * @wait: wait for the application to finish and return status. 225 * @wait: wait for the application to finish and return status.
219 * 226 *
220 * Runs a user-space application. The application is started 227 * Runs a user-space application. The application is started
@@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data)
224 * Must be called from process context. Returns a negative error code 231 * Must be called from process context. Returns a negative error code
225 * if program was not execed successfully, or 0. 232 * if program was not execed successfully, or 0.
226 */ 233 */
227int call_usermodehelper(char *path, char **argv, char **envp, int wait) 234int call_usermodehelper_keys(char *path, char **argv, char **envp,
235 struct key *session_keyring, int wait)
228{ 236{
229 DECLARE_COMPLETION(done); 237 DECLARE_COMPLETION(done);
230 struct subprocess_info sub_info = { 238 struct subprocess_info sub_info = {
@@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
232 .path = path, 240 .path = path,
233 .argv = argv, 241 .argv = argv,
234 .envp = envp, 242 .envp = envp,
243 .ring = session_keyring,
235 .wait = wait, 244 .wait = wait,
236 .retval = 0, 245 .retval = 0,
237 }; 246 };
@@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
247 wait_for_completion(&done); 256 wait_for_completion(&done);
248 return sub_info.retval; 257 return sub_info.retval;
249} 258}
250EXPORT_SYMBOL(call_usermodehelper); 259EXPORT_SYMBOL(call_usermodehelper_keys);
251 260
252void __init usermodehelper_init(void) 261void __init usermodehelper_init(void)
253{ 262{
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 037142b72a49..b0237122b24e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -27,12 +27,16 @@
27 * interface to access function arguments. 27 * interface to access function arguments.
28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes 28 * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
29 * exceptions notifier to be first on the priority list. 29 * exceptions notifier to be first on the priority list.
30 * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
31 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
32 * <prasanna@in.ibm.com> added function-return probes.
30 */ 33 */
31#include <linux/kprobes.h> 34#include <linux/kprobes.h>
32#include <linux/spinlock.h> 35#include <linux/spinlock.h>
33#include <linux/hash.h> 36#include <linux/hash.h>
34#include <linux/init.h> 37#include <linux/init.h>
35#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h>
36#include <asm/cacheflush.h> 40#include <asm/cacheflush.h>
37#include <asm/errno.h> 41#include <asm/errno.h>
38#include <asm/kdebug.h> 42#include <asm/kdebug.h>
@@ -41,11 +45,112 @@
41#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) 45#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
42 46
43static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 47static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
48static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
44 49
45unsigned int kprobe_cpu = NR_CPUS; 50unsigned int kprobe_cpu = NR_CPUS;
46static DEFINE_SPINLOCK(kprobe_lock); 51static DEFINE_SPINLOCK(kprobe_lock);
47static struct kprobe *curr_kprobe; 52static struct kprobe *curr_kprobe;
48 53
54/*
55 * kprobe->ainsn.insn points to the copy of the instruction to be
56 * single-stepped. x86_64, POWER4 and above have no-exec support and
57 * stepping on the instruction on a vmalloced/kmalloced/data page
58 * is a recipe for disaster
59 */
60#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
61
62struct kprobe_insn_page {
63 struct hlist_node hlist;
64 kprobe_opcode_t *insns; /* Page of instruction slots */
65 char slot_used[INSNS_PER_PAGE];
66 int nused;
67};
68
69static struct hlist_head kprobe_insn_pages;
70
71/**
72 * get_insn_slot() - Find a slot on an executable page for an instruction.
73 * We allocate an executable page if there's no room on existing ones.
74 */
75kprobe_opcode_t *get_insn_slot(void)
76{
77 struct kprobe_insn_page *kip;
78 struct hlist_node *pos;
79
80 hlist_for_each(pos, &kprobe_insn_pages) {
81 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
82 if (kip->nused < INSNS_PER_PAGE) {
83 int i;
84 for (i = 0; i < INSNS_PER_PAGE; i++) {
85 if (!kip->slot_used[i]) {
86 kip->slot_used[i] = 1;
87 kip->nused++;
88 return kip->insns + (i * MAX_INSN_SIZE);
89 }
90 }
91 /* Surprise! No unused slots. Fix kip->nused. */
92 kip->nused = INSNS_PER_PAGE;
93 }
94 }
95
96 /* All out of space. Need to allocate a new page. Use slot 0.*/
97 kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
98 if (!kip) {
99 return NULL;
100 }
101
102 /*
103 * Use module_alloc so this page is within +/- 2GB of where the
104 * kernel image and loaded module images reside. This is required
105 * so x86_64 can correctly handle the %rip-relative fixups.
106 */
107 kip->insns = module_alloc(PAGE_SIZE);
108 if (!kip->insns) {
109 kfree(kip);
110 return NULL;
111 }
112 INIT_HLIST_NODE(&kip->hlist);
113 hlist_add_head(&kip->hlist, &kprobe_insn_pages);
114 memset(kip->slot_used, 0, INSNS_PER_PAGE);
115 kip->slot_used[0] = 1;
116 kip->nused = 1;
117 return kip->insns;
118}
119
120void free_insn_slot(kprobe_opcode_t *slot)
121{
122 struct kprobe_insn_page *kip;
123 struct hlist_node *pos;
124
125 hlist_for_each(pos, &kprobe_insn_pages) {
126 kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
127 if (kip->insns <= slot &&
128 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
129 int i = (slot - kip->insns) / MAX_INSN_SIZE;
130 kip->slot_used[i] = 0;
131 kip->nused--;
132 if (kip->nused == 0) {
133 /*
134 * Page is no longer in use. Free it unless
135 * it's the last one. We keep the last one
136 * so as not to have to set it up again the
137 * next time somebody inserts a probe.
138 */
139 hlist_del(&kip->hlist);
140 if (hlist_empty(&kprobe_insn_pages)) {
141 INIT_HLIST_NODE(&kip->hlist);
142 hlist_add_head(&kip->hlist,
143 &kprobe_insn_pages);
144 } else {
145 module_free(NULL, kip->insns);
146 kfree(kip);
147 }
148 }
149 return;
150 }
151 }
152}
153
49/* Locks kprobe: irqs must be disabled */ 154/* Locks kprobe: irqs must be disabled */
50void lock_kprobes(void) 155void lock_kprobes(void)
51{ 156{
@@ -78,22 +183,23 @@ struct kprobe *get_kprobe(void *addr)
78 * Aggregate handlers for multiple kprobes support - these handlers 183 * Aggregate handlers for multiple kprobes support - these handlers
79 * take care of invoking the individual kprobe handlers on p->list 184 * take care of invoking the individual kprobe handlers on p->list
80 */ 185 */
81int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 186static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
82{ 187{
83 struct kprobe *kp; 188 struct kprobe *kp;
84 189
85 list_for_each_entry(kp, &p->list, list) { 190 list_for_each_entry(kp, &p->list, list) {
86 if (kp->pre_handler) { 191 if (kp->pre_handler) {
87 curr_kprobe = kp; 192 curr_kprobe = kp;
88 kp->pre_handler(kp, regs); 193 if (kp->pre_handler(kp, regs))
89 curr_kprobe = NULL; 194 return 1;
90 } 195 }
196 curr_kprobe = NULL;
91 } 197 }
92 return 0; 198 return 0;
93} 199}
94 200
95void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 201static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
96 unsigned long flags) 202 unsigned long flags)
97{ 203{
98 struct kprobe *kp; 204 struct kprobe *kp;
99 205
@@ -107,7 +213,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
107 return; 213 return;
108} 214}
109 215
110int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) 216static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
217 int trapnr)
111{ 218{
112 /* 219 /*
113 * if we faulted "during" the execution of a user specified 220 * if we faulted "during" the execution of a user specified
@@ -120,19 +227,159 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
120 return 0; 227 return 0;
121} 228}
122 229
230static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
231{
232 struct kprobe *kp = curr_kprobe;
233 if (curr_kprobe && kp->break_handler) {
234 if (kp->break_handler(kp, regs)) {
235 curr_kprobe = NULL;
236 return 1;
237 }
238 }
239 curr_kprobe = NULL;
240 return 0;
241}
242
243struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
244{
245 struct hlist_node *node;
246 struct kretprobe_instance *ri;
247 hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
248 return ri;
249 return NULL;
250}
251
252static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
253{
254 struct hlist_node *node;
255 struct kretprobe_instance *ri;
256 hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
257 return ri;
258 return NULL;
259}
260
261void add_rp_inst(struct kretprobe_instance *ri)
262{
263 /*
264 * Remove rp inst off the free list -
265 * Add it back when probed function returns
266 */
267 hlist_del(&ri->uflist);
268
269 /* Add rp inst onto table */
270 INIT_HLIST_NODE(&ri->hlist);
271 hlist_add_head(&ri->hlist,
272 &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
273
274 /* Also add this rp inst to the used list. */
275 INIT_HLIST_NODE(&ri->uflist);
276 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
277}
278
279void recycle_rp_inst(struct kretprobe_instance *ri)
280{
281 /* remove rp inst off the rprobe_inst_table */
282 hlist_del(&ri->hlist);
283 if (ri->rp) {
284 /* remove rp inst off the used list */
285 hlist_del(&ri->uflist);
286 /* put rp inst back onto the free list */
287 INIT_HLIST_NODE(&ri->uflist);
288 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
289 } else
290 /* Unregistering */
291 kfree(ri);
292}
293
294struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
295{
296 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
297}
298
299/*
300 * This function is called from exit_thread or flush_thread when task tk's
301 * stack is being recycled so that we can recycle any function-return probe
302 * instances associated with this task. These left over instances represent
303 * probed functions that have been called but will never return.
304 */
305void kprobe_flush_task(struct task_struct *tk)
306{
307 struct kretprobe_instance *ri;
308 struct hlist_head *head;
309 struct hlist_node *node, *tmp;
310 unsigned long flags = 0;
311
312 spin_lock_irqsave(&kprobe_lock, flags);
313 head = kretprobe_inst_table_head(current);
314 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
315 if (ri->task == tk)
316 recycle_rp_inst(ri);
317 }
318 spin_unlock_irqrestore(&kprobe_lock, flags);
319}
320
321/*
322 * This kprobe pre_handler is registered with every kretprobe. When probe
323 * hits it will set up the return probe.
324 */
325static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
326{
327 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
328
329 /*TODO: consider to only swap the RA after the last pre_handler fired */
330 arch_prepare_kretprobe(rp, regs);
331 return 0;
332}
333
334static inline void free_rp_inst(struct kretprobe *rp)
335{
336 struct kretprobe_instance *ri;
337 while ((ri = get_free_rp_inst(rp)) != NULL) {
338 hlist_del(&ri->uflist);
339 kfree(ri);
340 }
341}
342
343/*
344 * Keep all fields in the kprobe consistent
345 */
346static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
347{
348 memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
349 memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
350}
351
352/*
353* Add the new probe to old_p->list. Fail if this is the
354* second jprobe at the address - two jprobes can't coexist
355*/
356static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
357{
358 struct kprobe *kp;
359
360 if (p->break_handler) {
361 list_for_each_entry(kp, &old_p->list, list) {
362 if (kp->break_handler)
363 return -EEXIST;
364 }
365 list_add_tail(&p->list, &old_p->list);
366 } else
367 list_add(&p->list, &old_p->list);
368 return 0;
369}
370
123/* 371/*
124 * Fill in the required fields of the "manager kprobe". Replace the 372 * Fill in the required fields of the "manager kprobe". Replace the
125 * earlier kprobe in the hlist with the manager kprobe 373 * earlier kprobe in the hlist with the manager kprobe
126 */ 374 */
127static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 375static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
128{ 376{
377 copy_kprobe(p, ap);
129 ap->addr = p->addr; 378 ap->addr = p->addr;
130 ap->opcode = p->opcode;
131 memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
132
133 ap->pre_handler = aggr_pre_handler; 379 ap->pre_handler = aggr_pre_handler;
134 ap->post_handler = aggr_post_handler; 380 ap->post_handler = aggr_post_handler;
135 ap->fault_handler = aggr_fault_handler; 381 ap->fault_handler = aggr_fault_handler;
382 ap->break_handler = aggr_break_handler;
136 383
137 INIT_LIST_HEAD(&ap->list); 384 INIT_LIST_HEAD(&ap->list);
138 list_add(&p->list, &ap->list); 385 list_add(&p->list, &ap->list);
@@ -153,16 +400,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
153 int ret = 0; 400 int ret = 0;
154 struct kprobe *ap; 401 struct kprobe *ap;
155 402
156 if (old_p->break_handler || p->break_handler) { 403 if (old_p->pre_handler == aggr_pre_handler) {
157 ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ 404 copy_kprobe(old_p, p);
158 } else if (old_p->pre_handler == aggr_pre_handler) { 405 ret = add_new_kprobe(old_p, p);
159 list_add(&p->list, &old_p->list);
160 } else { 406 } else {
161 ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); 407 ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC);
162 if (!ap) 408 if (!ap)
163 return -ENOMEM; 409 return -ENOMEM;
164 add_aggr_kprobe(ap, old_p); 410 add_aggr_kprobe(ap, old_p);
165 list_add(&p->list, &ap->list); 411 copy_kprobe(ap, p);
412 ret = add_new_kprobe(ap, p);
166 } 413 }
167 return ret; 414 return ret;
168} 415}
@@ -170,10 +417,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
170/* kprobe removal house-keeping routines */ 417/* kprobe removal house-keeping routines */
171static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) 418static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
172{ 419{
173 *p->addr = p->opcode; 420 arch_disarm_kprobe(p);
174 hlist_del(&p->hlist); 421 hlist_del(&p->hlist);
175 flush_icache_range((unsigned long) p->addr,
176 (unsigned long) p->addr + sizeof(kprobe_opcode_t));
177 spin_unlock_irqrestore(&kprobe_lock, flags); 422 spin_unlock_irqrestore(&kprobe_lock, flags);
178 arch_remove_kprobe(p); 423 arch_remove_kprobe(p);
179} 424}
@@ -200,6 +445,7 @@ int register_kprobe(struct kprobe *p)
200 } 445 }
201 spin_lock_irqsave(&kprobe_lock, flags); 446 spin_lock_irqsave(&kprobe_lock, flags);
202 old_p = get_kprobe(p->addr); 447 old_p = get_kprobe(p->addr);
448 p->nmissed = 0;
203 if (old_p) { 449 if (old_p) {
204 ret = register_aggr_kprobe(old_p, p); 450 ret = register_aggr_kprobe(old_p, p);
205 goto out; 451 goto out;
@@ -210,10 +456,8 @@ int register_kprobe(struct kprobe *p)
210 hlist_add_head(&p->hlist, 456 hlist_add_head(&p->hlist,
211 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 457 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
212 458
213 p->opcode = *p->addr; 459 arch_arm_kprobe(p);
214 *p->addr = BREAKPOINT_INSTRUCTION; 460
215 flush_icache_range((unsigned long) p->addr,
216 (unsigned long) p->addr + sizeof(kprobe_opcode_t));
217out: 461out:
218 spin_unlock_irqrestore(&kprobe_lock, flags); 462 spin_unlock_irqrestore(&kprobe_lock, flags);
219rm_kprobe: 463rm_kprobe:
@@ -257,16 +501,83 @@ void unregister_jprobe(struct jprobe *jp)
257 unregister_kprobe(&jp->kp); 501 unregister_kprobe(&jp->kp);
258} 502}
259 503
504#ifdef ARCH_SUPPORTS_KRETPROBES
505
506int register_kretprobe(struct kretprobe *rp)
507{
508 int ret = 0;
509 struct kretprobe_instance *inst;
510 int i;
511
512 rp->kp.pre_handler = pre_handler_kretprobe;
513
514 /* Pre-allocate memory for max kretprobe instances */
515 if (rp->maxactive <= 0) {
516#ifdef CONFIG_PREEMPT
517 rp->maxactive = max(10, 2 * NR_CPUS);
518#else
519 rp->maxactive = NR_CPUS;
520#endif
521 }
522 INIT_HLIST_HEAD(&rp->used_instances);
523 INIT_HLIST_HEAD(&rp->free_instances);
524 for (i = 0; i < rp->maxactive; i++) {
525 inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
526 if (inst == NULL) {
527 free_rp_inst(rp);
528 return -ENOMEM;
529 }
530 INIT_HLIST_NODE(&inst->uflist);
531 hlist_add_head(&inst->uflist, &rp->free_instances);
532 }
533
534 rp->nmissed = 0;
535 /* Establish function entry probe point */
536 if ((ret = register_kprobe(&rp->kp)) != 0)
537 free_rp_inst(rp);
538 return ret;
539}
540
541#else /* ARCH_SUPPORTS_KRETPROBES */
542
543int register_kretprobe(struct kretprobe *rp)
544{
545 return -ENOSYS;
546}
547
548#endif /* ARCH_SUPPORTS_KRETPROBES */
549
550void unregister_kretprobe(struct kretprobe *rp)
551{
552 unsigned long flags;
553 struct kretprobe_instance *ri;
554
555 unregister_kprobe(&rp->kp);
556 /* No race here */
557 spin_lock_irqsave(&kprobe_lock, flags);
558 free_rp_inst(rp);
559 while ((ri = get_used_rp_inst(rp)) != NULL) {
560 ri->rp = NULL;
561 hlist_del(&ri->uflist);
562 }
563 spin_unlock_irqrestore(&kprobe_lock, flags);
564}
565
260static int __init init_kprobes(void) 566static int __init init_kprobes(void)
261{ 567{
262 int i, err = 0; 568 int i, err = 0;
263 569
264 /* FIXME allocate the probe table, currently defined statically */ 570 /* FIXME allocate the probe table, currently defined statically */
265 /* initialize all list heads */ 571 /* initialize all list heads */
266 for (i = 0; i < KPROBE_TABLE_SIZE; i++) 572 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
267 INIT_HLIST_HEAD(&kprobe_table[i]); 573 INIT_HLIST_HEAD(&kprobe_table[i]);
574 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
575 }
576
577 err = arch_init_kprobes();
578 if (!err)
579 err = register_die_notifier(&kprobe_exceptions_nb);
268 580
269 err = register_die_notifier(&kprobe_exceptions_nb);
270 return err; 581 return err;
271} 582}
272 583
@@ -277,3 +588,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe);
277EXPORT_SYMBOL_GPL(register_jprobe); 588EXPORT_SYMBOL_GPL(register_jprobe);
278EXPORT_SYMBOL_GPL(unregister_jprobe); 589EXPORT_SYMBOL_GPL(unregister_jprobe);
279EXPORT_SYMBOL_GPL(jprobe_return); 590EXPORT_SYMBOL_GPL(jprobe_return);
591EXPORT_SYMBOL_GPL(register_kretprobe);
592EXPORT_SYMBOL_GPL(unregister_kretprobe);
593
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1f064a63f8cf..015fb69ad94d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
30KERNEL_ATTR_RO(hotplug_seqnum); 30KERNEL_ATTR_RO(hotplug_seqnum);
31#endif 31#endif
32 32
33#ifdef CONFIG_KEXEC
34#include <asm/kexec.h>
35
36static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
37{
38 return sprintf(page, "%p\n", (void *)crash_notes);
39}
40KERNEL_ATTR_RO(crash_notes);
41#endif
42
33decl_subsys(kernel, NULL, NULL); 43decl_subsys(kernel, NULL, NULL);
34EXPORT_SYMBOL_GPL(kernel_subsys); 44EXPORT_SYMBOL_GPL(kernel_subsys);
35 45
@@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = {
37#ifdef CONFIG_HOTPLUG 47#ifdef CONFIG_HOTPLUG
38 &hotplug_seqnum_attr.attr, 48 &hotplug_seqnum_attr.attr,
39#endif 49#endif
50#ifdef CONFIG_KEXEC
51 &crash_notes_attr.attr,
52#endif
40 NULL 53 NULL
41}; 54};
42 55
diff --git a/kernel/module.c b/kernel/module.c
index a566745dde62..c32995fbd8fd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -35,6 +35,7 @@
35#include <linux/notifier.h> 35#include <linux/notifier.h>
36#include <linux/stop_machine.h> 36#include <linux/stop_machine.h>
37#include <linux/device.h> 37#include <linux/device.h>
38#include <linux/string.h>
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39#include <asm/semaphore.h> 40#include <asm/semaphore.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
@@ -249,13 +250,18 @@ static inline unsigned int block_size(int val)
249/* Created by linker magic */ 250/* Created by linker magic */
250extern char __per_cpu_start[], __per_cpu_end[]; 251extern char __per_cpu_start[], __per_cpu_end[];
251 252
252static void *percpu_modalloc(unsigned long size, unsigned long align) 253static void *percpu_modalloc(unsigned long size, unsigned long align,
254 const char *name)
253{ 255{
254 unsigned long extra; 256 unsigned long extra;
255 unsigned int i; 257 unsigned int i;
256 void *ptr; 258 void *ptr;
257 259
258 BUG_ON(align > SMP_CACHE_BYTES); 260 if (align > SMP_CACHE_BYTES) {
261 printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n",
262 name, align, SMP_CACHE_BYTES);
263 align = SMP_CACHE_BYTES;
264 }
259 265
260 ptr = __per_cpu_start; 266 ptr = __per_cpu_start;
261 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { 267 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -347,7 +353,8 @@ static int percpu_modinit(void)
347} 353}
348__initcall(percpu_modinit); 354__initcall(percpu_modinit);
349#else /* ... !CONFIG_SMP */ 355#else /* ... !CONFIG_SMP */
350static inline void *percpu_modalloc(unsigned long size, unsigned long align) 356static inline void *percpu_modalloc(unsigned long size, unsigned long align,
357 const char *name)
351{ 358{
352 return NULL; 359 return NULL;
353} 360}
@@ -370,6 +377,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
370#endif /* CONFIG_SMP */ 377#endif /* CONFIG_SMP */
371 378
372#ifdef CONFIG_MODULE_UNLOAD 379#ifdef CONFIG_MODULE_UNLOAD
380#define MODINFO_ATTR(field) \
381static void setup_modinfo_##field(struct module *mod, const char *s) \
382{ \
383 mod->field = kstrdup(s, GFP_KERNEL); \
384} \
385static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
386 struct module *mod, char *buffer) \
387{ \
388 return sprintf(buffer, "%s\n", mod->field); \
389} \
390static int modinfo_##field##_exists(struct module *mod) \
391{ \
392 return mod->field != NULL; \
393} \
394static void free_modinfo_##field(struct module *mod) \
395{ \
396 kfree(mod->field); \
397 mod->field = NULL; \
398} \
399static struct module_attribute modinfo_##field = { \
400 .attr = { .name = __stringify(field), .mode = 0444, \
401 .owner = THIS_MODULE }, \
402 .show = show_modinfo_##field, \
403 .setup = setup_modinfo_##field, \
404 .test = modinfo_##field##_exists, \
405 .free = free_modinfo_##field, \
406};
407
408MODINFO_ATTR(version);
409MODINFO_ATTR(srcversion);
410
411static struct module_attribute *modinfo_attrs[] = {
412 &modinfo_version,
413 &modinfo_srcversion,
414 NULL,
415};
416
373/* Init the unload section of the module. */ 417/* Init the unload section of the module. */
374static void module_unload_init(struct module *mod) 418static void module_unload_init(struct module *mod)
375{ 419{
@@ -692,7 +736,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp)
692 return 0; 736 return 0;
693} 737}
694 738
695int set_obsolete(const char *val, struct kernel_param *kp) 739static int set_obsolete(const char *val, struct kernel_param *kp)
696{ 740{
697 unsigned int min, max; 741 unsigned int min, max;
698 unsigned int size, maxsize; 742 unsigned int size, maxsize;
@@ -1031,6 +1075,32 @@ static void module_remove_refcnt_attr(struct module *mod)
1031} 1075}
1032#endif 1076#endif
1033 1077
1078#ifdef CONFIG_MODULE_UNLOAD
1079static int module_add_modinfo_attrs(struct module *mod)
1080{
1081 struct module_attribute *attr;
1082 int error = 0;
1083 int i;
1084
1085 for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
1086 if (!attr->test ||
1087 (attr->test && attr->test(mod)))
1088 error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
1089 }
1090 return error;
1091}
1092
1093static void module_remove_modinfo_attrs(struct module *mod)
1094{
1095 struct module_attribute *attr;
1096 int i;
1097
1098 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1099 sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
1100 attr->free(mod);
1101 }
1102}
1103#endif
1034 1104
1035static int mod_sysfs_setup(struct module *mod, 1105static int mod_sysfs_setup(struct module *mod,
1036 struct kernel_param *kparam, 1106 struct kernel_param *kparam,
@@ -1056,6 +1126,12 @@ static int mod_sysfs_setup(struct module *mod,
1056 if (err) 1126 if (err)
1057 goto out_unreg; 1127 goto out_unreg;
1058 1128
1129#ifdef CONFIG_MODULE_UNLOAD
1130 err = module_add_modinfo_attrs(mod);
1131 if (err)
1132 goto out_unreg;
1133#endif
1134
1059 return 0; 1135 return 0;
1060 1136
1061out_unreg: 1137out_unreg:
@@ -1066,6 +1142,9 @@ out:
1066 1142
1067static void mod_kobject_remove(struct module *mod) 1143static void mod_kobject_remove(struct module *mod)
1068{ 1144{
1145#ifdef CONFIG_MODULE_UNLOAD
1146 module_remove_modinfo_attrs(mod);
1147#endif
1069 module_remove_refcnt_attr(mod); 1148 module_remove_refcnt_attr(mod);
1070 module_param_sysfs_remove(mod); 1149 module_param_sysfs_remove(mod);
1071 1150
@@ -1311,6 +1390,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
1311 return NULL; 1390 return NULL;
1312} 1391}
1313 1392
1393#ifdef CONFIG_MODULE_UNLOAD
1394static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1395 unsigned int infoindex)
1396{
1397 struct module_attribute *attr;
1398 int i;
1399
1400 for (i = 0; (attr = modinfo_attrs[i]); i++) {
1401 if (attr->setup)
1402 attr->setup(mod,
1403 get_modinfo(sechdrs,
1404 infoindex,
1405 attr->attr.name));
1406 }
1407}
1408#endif
1409
1314#ifdef CONFIG_KALLSYMS 1410#ifdef CONFIG_KALLSYMS
1315int is_exported(const char *name, const struct module *mod) 1411int is_exported(const char *name, const struct module *mod)
1316{ 1412{
@@ -1554,7 +1650,8 @@ static struct module *load_module(void __user *umod,
1554 if (pcpuindex) { 1650 if (pcpuindex) {
1555 /* We have a special allocation for this section. */ 1651 /* We have a special allocation for this section. */
1556 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 1652 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
1557 sechdrs[pcpuindex].sh_addralign); 1653 sechdrs[pcpuindex].sh_addralign,
1654 mod->name);
1558 if (!percpu) { 1655 if (!percpu) {
1559 err = -ENOMEM; 1656 err = -ENOMEM;
1560 goto free_mod; 1657 goto free_mod;
@@ -1615,6 +1712,11 @@ static struct module *load_module(void __user *umod,
1615 /* Set up license info based on the info section */ 1712 /* Set up license info based on the info section */
1616 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1713 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1617 1714
1715#ifdef CONFIG_MODULE_UNLOAD
1716 /* Set up MODINFO_ATTR fields */
1717 setup_modinfo(mod, sechdrs, infoindex);
1718#endif
1719
1618 /* Fix up syms, so that st_value is a pointer to location. */ 1720 /* Fix up syms, so that st_value is a pointer to location. */
1619 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, 1721 err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
1620 mod); 1722 mod);
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..aabc5f86fa3f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
18#include <linux/sysrq.h> 18#include <linux/sysrq.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/nmi.h> 20#include <linux/nmi.h>
21#include <linux/kexec.h>
21 22
22int panic_timeout; 23int panic_timeout;
23int panic_on_oops; 24int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
63 unsigned long caller = (unsigned long) __builtin_return_address(0); 64 unsigned long caller = (unsigned long) __builtin_return_address(0);
64#endif 65#endif
65 66
67 /*
68 * It's possible to come here directly from a panic-assertion and not
69 * have preempt disabled. Some functions called from here want
70 * preempt to be disabled. No point enabling it later though...
71 */
72 preempt_disable();
73
66 bust_spinlocks(1); 74 bust_spinlocks(1);
67 va_start(args, fmt); 75 va_start(args, fmt);
68 vsnprintf(buf, sizeof(buf), fmt, args); 76 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
70 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); 78 printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
71 bust_spinlocks(0); 79 bust_spinlocks(0);
72 80
81 /*
82 * If we have crashed and we have a crash kernel loaded let it handle
83 * everything else.
84 * Do we want to call this before we try to display a message?
85 */
86 crash_kexec(NULL);
87
73#ifdef CONFIG_SMP 88#ifdef CONFIG_SMP
89 /*
90 * Note smp_send_stop is the usual smp shutdown function, which
91 * unfortunately means it may not be hardened to work in a panic
92 * situation.
93 */
74 smp_send_stop(); 94 smp_send_stop();
75#endif 95#endif
76 96
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
79 if (!panic_blink) 99 if (!panic_blink)
80 panic_blink = no_blink; 100 panic_blink = no_blink;
81 101
82 if (panic_timeout > 0) 102 if (panic_timeout > 0) {
83 {
84 /* 103 /*
85 * Delay timeout seconds before rebooting the machine. 104 * Delay timeout seconds before rebooting the machine.
86 * We can't use the "normal" timers since we just panicked.. 105 * We can't use the "normal" timers since we just panicked..
@@ -92,12 +111,11 @@ NORET_TYPE void panic(const char * fmt, ...)
92 mdelay(1); 111 mdelay(1);
93 i++; 112 i++;
94 } 113 }
95 /* 114 /* This will not be a clean reboot, with everything
96 * Should we run the reboot notifier. For the moment Im 115 * shutting down. But if there is a chance of
97 * choosing not too. It might crash, be corrupt or do 116 * rebooting the system it will be rebooted.
98 * more harm than good for other reasons.
99 */ 117 */
100 machine_restart(NULL); 118 emergency_restart();
101 } 119 }
102#ifdef __sparc__ 120#ifdef __sparc__
103 { 121 {
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index cabb63fc9e16..38798a2ff994 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -89,23 +89,6 @@ static struct idr posix_timers_id;
89static DEFINE_SPINLOCK(idr_lock); 89static DEFINE_SPINLOCK(idr_lock);
90 90
91/* 91/*
92 * Just because the timer is not in the timer list does NOT mean it is
93 * inactive. It could be in the "fire" routine getting a new expire time.
94 */
95#define TIMER_INACTIVE 1
96
97#ifdef CONFIG_SMP
98# define timer_active(tmr) \
99 ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
100# define set_timer_inactive(tmr) \
101 do { \
102 (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
103 } while (0)
104#else
105# define timer_active(tmr) BARFY // error to use outside of SMP
106# define set_timer_inactive(tmr) do { } while (0)
107#endif
108/*
109 * we assume that the new SIGEV_THREAD_ID shares no bits with the other 92 * we assume that the new SIGEV_THREAD_ID shares no bits with the other
110 * SIGEV values. Here we put out an error if this assumption fails. 93 * SIGEV values. Here we put out an error if this assumption fails.
111 */ 94 */
@@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer)
226 init_timer(&new_timer->it.real.timer); 209 init_timer(&new_timer->it.real.timer);
227 new_timer->it.real.timer.data = (unsigned long) new_timer; 210 new_timer->it.real.timer.data = (unsigned long) new_timer;
228 new_timer->it.real.timer.function = posix_timer_fn; 211 new_timer->it.real.timer.function = posix_timer_fn;
229 set_timer_inactive(new_timer);
230 return 0; 212 return 0;
231} 213}
232 214
@@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data)
480 int do_notify = 1; 462 int do_notify = 1;
481 463
482 spin_lock_irqsave(&timr->it_lock, flags); 464 spin_lock_irqsave(&timr->it_lock, flags);
483 set_timer_inactive(timr);
484 if (!list_empty(&timr->it.real.abs_timer_entry)) { 465 if (!list_empty(&timr->it.real.abs_timer_entry)) {
485 spin_lock(&abs_list.lock); 466 spin_lock(&abs_list.lock);
486 do { 467 do {
@@ -915,21 +896,10 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp,
915 jiffies_64_f = get_jiffies_64(); 896 jiffies_64_f = get_jiffies_64();
916 } 897 }
917 /* 898 /*
918 * Take away now to get delta 899 * Take away now to get delta and normalize
919 */
920 oc.tv_sec -= now.tv_sec;
921 oc.tv_nsec -= now.tv_nsec;
922 /*
923 * Normalize...
924 */ 900 */
925 while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) { 901 set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec,
926 oc.tv_nsec -= NSEC_PER_SEC; 902 oc.tv_nsec - now.tv_nsec);
927 oc.tv_sec++;
928 }
929 while ((oc.tv_nsec) < 0) {
930 oc.tv_nsec += NSEC_PER_SEC;
931 oc.tv_sec--;
932 }
933 }else{ 903 }else{
934 jiffies_64_f = get_jiffies_64(); 904 jiffies_64_f = get_jiffies_64();
935 } 905 }
@@ -983,8 +953,8 @@ common_timer_set(struct k_itimer *timr, int flags,
983 * careful here. If smp we could be in the "fire" routine which will 953 * careful here. If smp we could be in the "fire" routine which will
984 * be spinning as we hold the lock. But this is ONLY an SMP issue. 954 * be spinning as we hold the lock. But this is ONLY an SMP issue.
985 */ 955 */
956 if (try_to_del_timer_sync(&timr->it.real.timer) < 0) {
986#ifdef CONFIG_SMP 957#ifdef CONFIG_SMP
987 if (timer_active(timr) && !del_timer(&timr->it.real.timer))
988 /* 958 /*
989 * It can only be active if on an other cpu. Since 959 * It can only be active if on an other cpu. Since
990 * we have cleared the interval stuff above, it should 960 * we have cleared the interval stuff above, it should
@@ -994,11 +964,9 @@ common_timer_set(struct k_itimer *timr, int flags,
994 * a "retry" exit status. 964 * a "retry" exit status.
995 */ 965 */
996 return TIMER_RETRY; 966 return TIMER_RETRY;
997
998 set_timer_inactive(timr);
999#else
1000 del_timer(&timr->it.real.timer);
1001#endif 967#endif
968 }
969
1002 remove_from_abslist(timr); 970 remove_from_abslist(timr);
1003 971
1004 timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 972 timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
@@ -1083,8 +1051,9 @@ retry:
1083static inline int common_timer_del(struct k_itimer *timer) 1051static inline int common_timer_del(struct k_itimer *timer)
1084{ 1052{
1085 timer->it.real.incr = 0; 1053 timer->it.real.incr = 0;
1054
1055 if (try_to_del_timer_sync(&timer->it.real.timer) < 0) {
1086#ifdef CONFIG_SMP 1056#ifdef CONFIG_SMP
1087 if (timer_active(timer) && !del_timer(&timer->it.real.timer))
1088 /* 1057 /*
1089 * It can only be active if on an other cpu. Since 1058 * It can only be active if on an other cpu. Since
1090 * we have cleared the interval stuff above, it should 1059 * we have cleared the interval stuff above, it should
@@ -1094,9 +1063,9 @@ static inline int common_timer_del(struct k_itimer *timer)
1094 * a "retry" exit status. 1063 * a "retry" exit status.
1095 */ 1064 */
1096 return TIMER_RETRY; 1065 return TIMER_RETRY;
1097#else
1098 del_timer(&timer->it.real.timer);
1099#endif 1066#endif
1067 }
1068
1100 remove_from_abslist(timer); 1069 remove_from_abslist(timer);
1101 1070
1102 return 0; 1071 return 0;
@@ -1197,7 +1166,6 @@ void exit_itimers(struct signal_struct *sig)
1197 tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); 1166 tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
1198 itimer_delete(tmr); 1167 itimer_delete(tmr);
1199 } 1168 }
1200 del_timer_sync(&sig->real_timer);
1201} 1169}
1202 1170
1203/* 1171/*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 696387ffe49c..2c7121d9bff1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,8 +27,8 @@ config PM_DEBUG
27 like suspend support. 27 like suspend support.
28 28
29config SOFTWARE_SUSPEND 29config SOFTWARE_SUSPEND
30 bool "Software Suspend (EXPERIMENTAL)" 30 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP 31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
32 ---help--- 32 ---help---
33 Enable the possibility of suspending the machine. 33 Enable the possibility of suspending the machine.
34 It doesn't need APM. 34 It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
72 suspended image to. It will simply pick the first available swap 72 suspended image to. It will simply pick the first available swap
73 device. 73 device.
74 74
75config SUSPEND_SMP
76 bool
77 depends on HOTPLUG_CPU && X86 && PM
78 default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index fbdc634135a7..2f438d0eaa13 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6swsusp-smp-$(CONFIG_SMP) += smp.o
7
8obj-y := main.o process.o console.o pm.o 6obj-y := main.o process.o console.o pm.o
9obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o 7obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o
8
9obj-$(CONFIG_SUSPEND_SMP) += smp.o
10 10
11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 11obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02b6764034dc..664eb0469b6e 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -16,6 +16,8 @@
16#include <linux/device.h> 16#include <linux/device.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h>
20
19#include "power.h" 21#include "power.h"
20 22
21 23
@@ -57,16 +59,13 @@ static void power_down(suspend_disk_method_t mode)
57 error = pm_ops->enter(PM_SUSPEND_DISK); 59 error = pm_ops->enter(PM_SUSPEND_DISK);
58 break; 60 break;
59 case PM_DISK_SHUTDOWN: 61 case PM_DISK_SHUTDOWN:
60 printk("Powering off system\n"); 62 kernel_power_off();
61 device_shutdown();
62 machine_power_off();
63 break; 63 break;
64 case PM_DISK_REBOOT: 64 case PM_DISK_REBOOT:
65 device_shutdown(); 65 kernel_restart(NULL);
66 machine_restart(NULL);
67 break; 66 break;
68 } 67 }
69 machine_halt(); 68 kernel_halt();
70 /* Valid image is on the disk, if we continue we risk serious data corruption 69 /* Valid image is on the disk, if we continue we risk serious data corruption
71 after resume. */ 70 after resume. */
72 printk(KERN_CRIT "Please power me down manually\n"); 71 printk(KERN_CRIT "Please power me down manually\n");
@@ -117,8 +116,8 @@ static void finish(void)
117{ 116{
118 device_resume(); 117 device_resume();
119 platform_finish(); 118 platform_finish();
120 enable_nonboot_cpus();
121 thaw_processes(); 119 thaw_processes();
120 enable_nonboot_cpus();
122 pm_restore_console(); 121 pm_restore_console();
123} 122}
124 123
@@ -131,28 +130,35 @@ static int prepare_processes(void)
131 130
132 sys_sync(); 131 sys_sync();
133 132
133 disable_nonboot_cpus();
134
134 if (freeze_processes()) { 135 if (freeze_processes()) {
135 error = -EBUSY; 136 error = -EBUSY;
136 return error; 137 goto thaw;
137 } 138 }
138 139
139 if (pm_disk_mode == PM_DISK_PLATFORM) { 140 if (pm_disk_mode == PM_DISK_PLATFORM) {
140 if (pm_ops && pm_ops->prepare) { 141 if (pm_ops && pm_ops->prepare) {
141 if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) 142 if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
142 return error; 143 goto thaw;
143 } 144 }
144 } 145 }
145 146
146 /* Free memory before shutting down devices. */ 147 /* Free memory before shutting down devices. */
147 free_some_memory(); 148 free_some_memory();
148
149 return 0; 149 return 0;
150thaw:
151 thaw_processes();
152 enable_nonboot_cpus();
153 pm_restore_console();
154 return error;
150} 155}
151 156
152static void unprepare_processes(void) 157static void unprepare_processes(void)
153{ 158{
154 enable_nonboot_cpus(); 159 platform_finish();
155 thaw_processes(); 160 thaw_processes();
161 enable_nonboot_cpus();
156 pm_restore_console(); 162 pm_restore_console();
157} 163}
158 164
@@ -160,15 +166,9 @@ static int prepare_devices(void)
160{ 166{
161 int error; 167 int error;
162 168
163 disable_nonboot_cpus(); 169 if ((error = device_suspend(PMSG_FREEZE)))
164 if ((error = device_suspend(PMSG_FREEZE))) {
165 printk("Some devices failed to suspend\n"); 170 printk("Some devices failed to suspend\n");
166 platform_finish(); 171 return error;
167 enable_nonboot_cpus();
168 return error;
169 }
170
171 return 0;
172} 172}
173 173
174/** 174/**
@@ -185,9 +185,9 @@ int pm_suspend_disk(void)
185 int error; 185 int error;
186 186
187 error = prepare_processes(); 187 error = prepare_processes();
188 if (!error) { 188 if (error)
189 error = prepare_devices(); 189 return error;
190 } 190 error = prepare_devices();
191 191
192 if (error) { 192 if (error) {
193 unprepare_processes(); 193 unprepare_processes();
@@ -233,6 +233,16 @@ static int software_resume(void)
233{ 233{
234 int error; 234 int error;
235 235
236 if (!swsusp_resume_device) {
237 if (!strlen(resume_file))
238 return -ENOENT;
239 swsusp_resume_device = name_to_dev_t(resume_file);
240 pr_debug("swsusp: Resume From Partition %s\n", resume_file);
241 } else {
242 pr_debug("swsusp: Resume From Partition %d:%d\n",
243 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
244 }
245
236 if (noresume) { 246 if (noresume) {
237 /** 247 /**
238 * FIXME: If noresume is specified, we need to find the partition 248 * FIXME: If noresume is specified, we need to find the partition
@@ -250,7 +260,7 @@ static int software_resume(void)
250 260
251 if ((error = prepare_processes())) { 261 if ((error = prepare_processes())) {
252 swsusp_close(); 262 swsusp_close();
253 goto Cleanup; 263 goto Done;
254 } 264 }
255 265
256 pr_debug("PM: Reading swsusp image.\n"); 266 pr_debug("PM: Reading swsusp image.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4cdebc972ff2..71aa0fd22007 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -19,6 +19,9 @@
19 19
20#include "power.h" 20#include "power.h"
21 21
22/*This is just an arbitrary number */
23#define FREE_PAGE_NUMBER (100)
24
22DECLARE_MUTEX(pm_sem); 25DECLARE_MUTEX(pm_sem);
23 26
24struct pm_ops * pm_ops = NULL; 27struct pm_ops * pm_ops = NULL;
@@ -49,17 +52,35 @@ void pm_set_ops(struct pm_ops * ops)
49static int suspend_prepare(suspend_state_t state) 52static int suspend_prepare(suspend_state_t state)
50{ 53{
51 int error = 0; 54 int error = 0;
55 unsigned int free_pages;
52 56
53 if (!pm_ops || !pm_ops->enter) 57 if (!pm_ops || !pm_ops->enter)
54 return -EPERM; 58 return -EPERM;
55 59
56 pm_prepare_console(); 60 pm_prepare_console();
57 61
62 disable_nonboot_cpus();
63
64 if (num_online_cpus() != 1) {
65 error = -EPERM;
66 goto Enable_cpu;
67 }
68
58 if (freeze_processes()) { 69 if (freeze_processes()) {
59 error = -EAGAIN; 70 error = -EAGAIN;
60 goto Thaw; 71 goto Thaw;
61 } 72 }
62 73
74 if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) {
75 pr_debug("PM: free some memory\n");
76 shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
77 if (nr_free_pages() < FREE_PAGE_NUMBER) {
78 error = -ENOMEM;
79 printk(KERN_ERR "PM: No enough memory\n");
80 goto Thaw;
81 }
82 }
83
63 if (pm_ops->prepare) { 84 if (pm_ops->prepare) {
64 if ((error = pm_ops->prepare(state))) 85 if ((error = pm_ops->prepare(state)))
65 goto Thaw; 86 goto Thaw;
@@ -75,6 +96,8 @@ static int suspend_prepare(suspend_state_t state)
75 pm_ops->finish(state); 96 pm_ops->finish(state);
76 Thaw: 97 Thaw:
77 thaw_processes(); 98 thaw_processes();
99 Enable_cpu:
100 enable_nonboot_cpus();
78 pm_restore_console(); 101 pm_restore_console();
79 return error; 102 return error;
80} 103}
@@ -113,6 +136,7 @@ static void suspend_finish(suspend_state_t state)
113 if (pm_ops && pm_ops->finish) 136 if (pm_ops && pm_ops->finish)
114 pm_ops->finish(state); 137 pm_ops->finish(state);
115 thaw_processes(); 138 thaw_processes();
139 enable_nonboot_cpus();
116 pm_restore_console(); 140 pm_restore_console();
117} 141}
118 142
@@ -150,12 +174,6 @@ static int enter_state(suspend_state_t state)
150 goto Unlock; 174 goto Unlock;
151 } 175 }
152 176
153 /* Suspend is hard to get right on SMP. */
154 if (num_online_cpus() != 1) {
155 error = -EPERM;
156 goto Unlock;
157 }
158
159 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 177 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
160 if ((error = suspend_prepare(state))) 178 if ((error = suspend_prepare(state)))
161 goto Unlock; 179 goto Unlock;
@@ -190,7 +208,7 @@ int software_suspend(void)
190 208
191int pm_suspend(suspend_state_t state) 209int pm_suspend(suspend_state_t state)
192{ 210{
193 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) 211 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
194 return enter_state(state); 212 return enter_state(state);
195 return -EINVAL; 213 return -EINVAL;
196} 214}
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 715081b2d829..7a4144ba3afd 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -9,6 +9,7 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/workqueue.h> 11#include <linux/workqueue.h>
12#include <linux/reboot.h>
12 13
13/* 14/*
14 * When the user hits Sys-Rq o to power down the machine this is the 15 * When the user hits Sys-Rq o to power down the machine this is the
@@ -17,8 +18,7 @@
17 18
18static void do_poweroff(void *dummy) 19static void do_poweroff(void *dummy)
19{ 20{
20 if (pm_power_off) 21 kernel_power_off();
21 pm_power_off();
22} 22}
23 23
24static DECLARE_WORK(poweroff_work, do_poweroff, NULL); 24static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78d92dc6a1ed..3bd0d261818f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p)
32} 32}
33 33
34/* Refrigerator is place where frozen processes are stored :-). */ 34/* Refrigerator is place where frozen processes are stored :-). */
35void refrigerator(unsigned long flag) 35void refrigerator(void)
36{ 36{
37 /* Hmm, should we be allowed to suspend when there are realtime 37 /* Hmm, should we be allowed to suspend when there are realtime
38 processes around? */ 38 processes around? */
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag)
41 current->state = TASK_UNINTERRUPTIBLE; 41 current->state = TASK_UNINTERRUPTIBLE;
42 pr_debug("%s entered refrigerator\n", current->comm); 42 pr_debug("%s entered refrigerator\n", current->comm);
43 printk("="); 43 printk("=");
44 current->flags &= ~PF_FREEZE;
45 44
45 frozen_process(current);
46 spin_lock_irq(&current->sighand->siglock); 46 spin_lock_irq(&current->sighand->siglock);
47 recalc_sigpending(); /* We sent fake signal, clean it up */ 47 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock); 48 spin_unlock_irq(&current->sighand->siglock);
49 49
50 current->flags |= PF_FROZEN; 50 while (frozen(current))
51 while (current->flags & PF_FROZEN)
52 schedule(); 51 schedule();
53 pr_debug("%s left refrigerator\n", current->comm); 52 pr_debug("%s left refrigerator\n", current->comm);
54 current->state = save; 53 current->state = save;
@@ -57,27 +56,23 @@ void refrigerator(unsigned long flag)
57/* 0 = success, else # of processes that we failed to stop */ 56/* 0 = success, else # of processes that we failed to stop */
58int freeze_processes(void) 57int freeze_processes(void)
59{ 58{
60 int todo; 59 int todo;
61 unsigned long start_time; 60 unsigned long start_time;
62 struct task_struct *g, *p; 61 struct task_struct *g, *p;
63 62 unsigned long flags;
63
64 printk( "Stopping tasks: " ); 64 printk( "Stopping tasks: " );
65 start_time = jiffies; 65 start_time = jiffies;
66 do { 66 do {
67 todo = 0; 67 todo = 0;
68 read_lock(&tasklist_lock); 68 read_lock(&tasklist_lock);
69 do_each_thread(g, p) { 69 do_each_thread(g, p) {
70 unsigned long flags;
71 if (!freezeable(p)) 70 if (!freezeable(p))
72 continue; 71 continue;
73 if ((p->flags & PF_FROZEN) || 72 if (frozen(p))
74 (p->state == TASK_TRACED) ||
75 (p->state == TASK_STOPPED))
76 continue; 73 continue;
77 74
78 /* FIXME: smp problem here: we may not access other process' flags 75 freeze(p);
79 without locking */
80 p->flags |= PF_FREEZE;
81 spin_lock_irqsave(&p->sighand->siglock, flags); 76 spin_lock_irqsave(&p->sighand->siglock, flags);
82 signal_wake_up(p, 0); 77 signal_wake_up(p, 0);
83 spin_unlock_irqrestore(&p->sighand->siglock, flags); 78 spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -91,7 +86,7 @@ int freeze_processes(void)
91 return todo; 86 return todo;
92 } 87 }
93 } while(todo); 88 } while(todo);
94 89
95 printk( "|\n" ); 90 printk( "|\n" );
96 BUG_ON(in_atomic()); 91 BUG_ON(in_atomic());
97 return 0; 92 return 0;
@@ -106,10 +101,7 @@ void thaw_processes(void)
106 do_each_thread(g, p) { 101 do_each_thread(g, p) {
107 if (!freezeable(p)) 102 if (!freezeable(p))
108 continue; 103 continue;
109 if (p->flags & PF_FROZEN) { 104 if (!thaw_process(p))
110 p->flags &= ~PF_FROZEN;
111 wake_up_process(p);
112 } else
113 printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); 105 printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
114 } while_each_thread(g, p); 106 } while_each_thread(g, p);
115 107
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 457c2302ed42..911fc62b8225 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -13,73 +13,52 @@
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/suspend.h> 14#include <linux/suspend.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/cpu.h>
16#include <asm/atomic.h> 17#include <asm/atomic.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18 19
19static atomic_t cpu_counter, freeze; 20/* This is protected by pm_sem semaphore */
20 21static cpumask_t frozen_cpus;
21
22static void smp_pause(void * data)
23{
24 struct saved_context ctxt;
25 __save_processor_state(&ctxt);
26 printk("Sleeping in:\n");
27 dump_stack();
28 atomic_inc(&cpu_counter);
29 while (atomic_read(&freeze)) {
30 /* FIXME: restore takes place at random piece inside this.
31 This should probably be written in assembly, and
32 preserve general-purpose registers, too
33
34 What about stack? We may need to move to new stack here.
35
36 This should better be ran with interrupts disabled.
37 */
38 cpu_relax();
39 barrier();
40 }
41 atomic_dec(&cpu_counter);
42 __restore_processor_state(&ctxt);
43}
44
45static cpumask_t oldmask;
46 22
47void disable_nonboot_cpus(void) 23void disable_nonboot_cpus(void)
48{ 24{
49 oldmask = current->cpus_allowed; 25 int cpu, error;
50 set_cpus_allowed(current, cpumask_of_cpu(0));
51 printk("Freezing CPUs (at %d)", raw_smp_processor_id());
52 current->state = TASK_INTERRUPTIBLE;
53 schedule_timeout(HZ);
54 printk("...");
55 BUG_ON(raw_smp_processor_id() != 0);
56 26
57 /* FIXME: for this to work, all the CPUs must be running 27 error = 0;
58 * "idle" thread (or we deadlock). Is that guaranteed? */ 28 cpus_clear(frozen_cpus);
59 29 printk("Freezing cpus ...\n");
60 atomic_set(&cpu_counter, 0); 30 for_each_online_cpu(cpu) {
61 atomic_set(&freeze, 1); 31 if (cpu == 0)
62 smp_call_function(smp_pause, NULL, 0, 0); 32 continue;
63 while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { 33 error = cpu_down(cpu);
64 cpu_relax(); 34 if (!error) {
65 barrier(); 35 cpu_set(cpu, frozen_cpus);
36 printk("CPU%d is down\n", cpu);
37 continue;
38 }
39 printk("Error taking cpu %d down: %d\n", cpu, error);
66 } 40 }
67 printk("ok\n"); 41 BUG_ON(raw_smp_processor_id() != 0);
42 if (error)
43 panic("cpus not sleeping");
68} 44}
69 45
70void enable_nonboot_cpus(void) 46void enable_nonboot_cpus(void)
71{ 47{
72 printk("Restarting CPUs"); 48 int cpu, error;
73 atomic_set(&freeze, 0);
74 while (atomic_read(&cpu_counter)) {
75 cpu_relax();
76 barrier();
77 }
78 printk("...");
79 set_cpus_allowed(current, oldmask);
80 schedule();
81 printk("ok\n");
82 49
50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = smp_prepare_cpu(cpu);
53 if (!error)
54 error = cpu_up(cpu);
55 if (!error) {
56 printk("CPU%d is up\n", cpu);
57 continue;
58 }
59 printk("Error taking cpu %d up: %d\n", cpu, error);
60 panic("Not enough cpus");
61 }
62 cpus_clear(frozen_cpus);
83} 63}
84 64
85
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 90b3b68dee3f..f2bc71b9fe8b 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -10,12 +10,12 @@
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
11 * 11 *
12 * I'd like to thank the following people for their work: 12 * I'd like to thank the following people for their work:
13 * 13 *
14 * Pavel Machek <pavel@ucw.cz>: 14 * Pavel Machek <pavel@ucw.cz>:
15 * Modifications, defectiveness pointing, being with me at the very beginning, 15 * Modifications, defectiveness pointing, being with me at the very beginning,
16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. 16 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
17 * 17 *
18 * Steve Doddi <dirk@loth.demon.co.uk>: 18 * Steve Doddi <dirk@loth.demon.co.uk>:
19 * Support the possibility of hardware state restoring. 19 * Support the possibility of hardware state restoring.
20 * 20 *
21 * Raph <grey.havens@earthling.net>: 21 * Raph <grey.havens@earthling.net>:
@@ -63,6 +63,7 @@
63#include <linux/console.h> 63#include <linux/console.h>
64#include <linux/highmem.h> 64#include <linux/highmem.h>
65#include <linux/bio.h> 65#include <linux/bio.h>
66#include <linux/mount.h>
66 67
67#include <asm/uaccess.h> 68#include <asm/uaccess.h>
68#include <asm/mmu_context.h> 69#include <asm/mmu_context.h>
@@ -81,14 +82,14 @@ static int nr_copy_pages_check;
81extern char resume_file[]; 82extern char resume_file[];
82 83
83/* Local variables that should not be affected by save */ 84/* Local variables that should not be affected by save */
84unsigned int nr_copy_pages __nosavedata = 0; 85static unsigned int nr_copy_pages __nosavedata = 0;
85 86
86/* Suspend pagedir is allocated before final copy, therefore it 87/* Suspend pagedir is allocated before final copy, therefore it
87 must be freed after resume 88 must be freed after resume
88 89
89 Warning: this is evil. There are actually two pagedirs at time of 90 Warning: this is evil. There are actually two pagedirs at time of
90 resume. One is "pagedir_save", which is empty frame allocated at 91 resume. One is "pagedir_save", which is empty frame allocated at
91 time of suspend, that must be freed. Second is "pagedir_nosave", 92 time of suspend, that must be freed. Second is "pagedir_nosave",
92 allocated at time of resume, that travels through memory not to 93 allocated at time of resume, that travels through memory not to
93 collide with anything. 94 collide with anything.
94 95
@@ -132,7 +133,7 @@ static int mark_swapfiles(swp_entry_t prev)
132{ 133{
133 int error; 134 int error;
134 135
135 rw_swap_page_sync(READ, 136 rw_swap_page_sync(READ,
136 swp_entry(root_swap, 0), 137 swp_entry(root_swap, 0),
137 virt_to_page((unsigned long)&swsusp_header)); 138 virt_to_page((unsigned long)&swsusp_header));
138 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 139 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
@@ -140,7 +141,7 @@ static int mark_swapfiles(swp_entry_t prev)
140 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 141 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
141 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 142 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
142 swsusp_header.swsusp_info = prev; 143 swsusp_header.swsusp_info = prev;
143 error = rw_swap_page_sync(WRITE, 144 error = rw_swap_page_sync(WRITE,
144 swp_entry(root_swap, 0), 145 swp_entry(root_swap, 0),
145 virt_to_page((unsigned long) 146 virt_to_page((unsigned long)
146 &swsusp_header)); 147 &swsusp_header));
@@ -174,22 +175,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
174static int swsusp_swap_check(void) /* This is called before saving image */ 175static int swsusp_swap_check(void) /* This is called before saving image */
175{ 176{
176 int i, len; 177 int i, len;
177 178
178 len=strlen(resume_file); 179 len=strlen(resume_file);
179 root_swap = 0xFFFF; 180 root_swap = 0xFFFF;
180 181
181 swap_list_lock(); 182 swap_list_lock();
182 for(i=0; i<MAX_SWAPFILES; i++) { 183 for (i=0; i<MAX_SWAPFILES; i++) {
183 if (swap_info[i].flags == 0) { 184 if (swap_info[i].flags == 0) {
184 swapfile_used[i]=SWAPFILE_UNUSED; 185 swapfile_used[i]=SWAPFILE_UNUSED;
185 } else { 186 } else {
186 if(!len) { 187 if (!len) {
187 printk(KERN_WARNING "resume= option should be used to set suspend device" ); 188 printk(KERN_WARNING "resume= option should be used to set suspend device" );
188 if(root_swap == 0xFFFF) { 189 if (root_swap == 0xFFFF) {
189 swapfile_used[i] = SWAPFILE_SUSPEND; 190 swapfile_used[i] = SWAPFILE_SUSPEND;
190 root_swap = i; 191 root_swap = i;
191 } else 192 } else
192 swapfile_used[i] = SWAPFILE_IGNORED; 193 swapfile_used[i] = SWAPFILE_IGNORED;
193 } else { 194 } else {
194 /* we ignore all swap devices that are not the resume_file */ 195 /* we ignore all swap devices that are not the resume_file */
195 if (is_resume_device(&swap_info[i])) { 196 if (is_resume_device(&swap_info[i])) {
@@ -209,15 +210,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */
209 * This is called after saving image so modification 210 * This is called after saving image so modification
210 * will be lost after resume... and that's what we want. 211 * will be lost after resume... and that's what we want.
211 * we make the device unusable. A new call to 212 * we make the device unusable. A new call to
212 * lock_swapdevices can unlock the devices. 213 * lock_swapdevices can unlock the devices.
213 */ 214 */
214static void lock_swapdevices(void) 215static void lock_swapdevices(void)
215{ 216{
216 int i; 217 int i;
217 218
218 swap_list_lock(); 219 swap_list_lock();
219 for(i = 0; i< MAX_SWAPFILES; i++) 220 for (i = 0; i< MAX_SWAPFILES; i++)
220 if(swapfile_used[i] == SWAPFILE_IGNORED) { 221 if (swapfile_used[i] == SWAPFILE_IGNORED) {
221 swap_info[i].flags ^= 0xFF; 222 swap_info[i].flags ^= 0xFF;
222 } 223 }
223 swap_list_unlock(); 224 swap_list_unlock();
@@ -229,7 +230,7 @@ static void lock_swapdevices(void)
229 * @loc: Place to store the entry we used. 230 * @loc: Place to store the entry we used.
230 * 231 *
231 * Allocate a new swap entry and 'sync' it. Note we discard -EIO 232 * Allocate a new swap entry and 'sync' it. Note we discard -EIO
232 * errors. That is an artifact left over from swsusp. It did not 233 * errors. That is an artifact left over from swsusp. It did not
233 * check the return of rw_swap_page_sync() at all, since most pages 234 * check the return of rw_swap_page_sync() at all, since most pages
234 * written back to swap would return -EIO. 235 * written back to swap would return -EIO.
235 * This is a partial improvement, since we will at least return other 236 * This is a partial improvement, since we will at least return other
@@ -241,7 +242,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
241 int error = 0; 242 int error = 0;
242 243
243 entry = get_swap_page(); 244 entry = get_swap_page();
244 if (swp_offset(entry) && 245 if (swp_offset(entry) &&
245 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { 246 swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
246 error = rw_swap_page_sync(WRITE, entry, 247 error = rw_swap_page_sync(WRITE, entry,
247 virt_to_page(addr)); 248 virt_to_page(addr));
@@ -257,7 +258,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
257/** 258/**
258 * data_free - Free the swap entries used by the saved image. 259 * data_free - Free the swap entries used by the saved image.
259 * 260 *
260 * Walk the list of used swap entries and free each one. 261 * Walk the list of used swap entries and free each one.
261 * This is only used for cleanup when suspend fails. 262 * This is only used for cleanup when suspend fails.
262 */ 263 */
263static void data_free(void) 264static void data_free(void)
@@ -290,7 +291,7 @@ static int data_write(void)
290 mod = 1; 291 mod = 1;
291 292
292 printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); 293 printk( "Writing data to swap (%d pages)... ", nr_copy_pages );
293 for_each_pbe(p, pagedir_nosave) { 294 for_each_pbe (p, pagedir_nosave) {
294 if (!(i%mod)) 295 if (!(i%mod))
295 printk( "\b\b\b\b%3d%%", i / mod ); 296 printk( "\b\b\b\b%3d%%", i / mod );
296 if ((error = write_page(p->address, &(p->swap_address)))) 297 if ((error = write_page(p->address, &(p->swap_address))))
@@ -335,7 +336,7 @@ static int close_swap(void)
335 336
336 dump_info(); 337 dump_info();
337 error = write_page((unsigned long)&swsusp_info, &entry); 338 error = write_page((unsigned long)&swsusp_info, &entry);
338 if (!error) { 339 if (!error) {
339 printk( "S" ); 340 printk( "S" );
340 error = mark_swapfiles(entry); 341 error = mark_swapfiles(entry);
341 printk( "|\n" ); 342 printk( "|\n" );
@@ -370,7 +371,7 @@ static int write_pagedir(void)
370 struct pbe * pbe; 371 struct pbe * pbe;
371 372
372 printk( "Writing pagedir..."); 373 printk( "Writing pagedir...");
373 for_each_pb_page(pbe, pagedir_nosave) { 374 for_each_pb_page (pbe, pagedir_nosave) {
374 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) 375 if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
375 return error; 376 return error;
376 } 377 }
@@ -472,7 +473,7 @@ static int save_highmem(void)
472 int res = 0; 473 int res = 0;
473 474
474 pr_debug("swsusp: Saving Highmem\n"); 475 pr_debug("swsusp: Saving Highmem\n");
475 for_each_zone(zone) { 476 for_each_zone (zone) {
476 if (is_highmem(zone)) 477 if (is_highmem(zone))
477 res = save_highmem_zone(zone); 478 res = save_highmem_zone(zone);
478 if (res) 479 if (res)
@@ -547,7 +548,7 @@ static void count_data_pages(void)
547 548
548 nr_copy_pages = 0; 549 nr_copy_pages = 0;
549 550
550 for_each_zone(zone) { 551 for_each_zone (zone) {
551 if (is_highmem(zone)) 552 if (is_highmem(zone))
552 continue; 553 continue;
553 mark_free_pages(zone); 554 mark_free_pages(zone);
@@ -562,9 +563,9 @@ static void copy_data_pages(void)
562 struct zone *zone; 563 struct zone *zone;
563 unsigned long zone_pfn; 564 unsigned long zone_pfn;
564 struct pbe * pbe = pagedir_nosave; 565 struct pbe * pbe = pagedir_nosave;
565 566
566 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); 567 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
567 for_each_zone(zone) { 568 for_each_zone (zone) {
568 if (is_highmem(zone)) 569 if (is_highmem(zone))
569 continue; 570 continue;
570 mark_free_pages(zone); 571 mark_free_pages(zone);
@@ -702,7 +703,7 @@ static void free_image_pages(void)
702{ 703{
703 struct pbe * p; 704 struct pbe * p;
704 705
705 for_each_pbe(p, pagedir_save) { 706 for_each_pbe (p, pagedir_save) {
706 if (p->address) { 707 if (p->address) {
707 ClearPageNosave(virt_to_page(p->address)); 708 ClearPageNosave(virt_to_page(p->address));
708 free_page(p->address); 709 free_page(p->address);
@@ -719,7 +720,7 @@ static int alloc_image_pages(void)
719{ 720{
720 struct pbe * p; 721 struct pbe * p;
721 722
722 for_each_pbe(p, pagedir_save) { 723 for_each_pbe (p, pagedir_save) {
723 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); 724 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
724 if (!p->address) 725 if (!p->address)
725 return -ENOMEM; 726 return -ENOMEM;
@@ -740,7 +741,7 @@ void swsusp_free(void)
740/** 741/**
741 * enough_free_mem - Make sure we enough free memory to snapshot. 742 * enough_free_mem - Make sure we enough free memory to snapshot.
742 * 743 *
743 * Returns TRUE or FALSE after checking the number of available 744 * Returns TRUE or FALSE after checking the number of available
744 * free pages. 745 * free pages.
745 */ 746 */
746 747
@@ -758,11 +759,11 @@ static int enough_free_mem(void)
758/** 759/**
759 * enough_swap - Make sure we have enough swap to save the image. 760 * enough_swap - Make sure we have enough swap to save the image.
760 * 761 *
761 * Returns TRUE or FALSE after checking the total amount of swap 762 * Returns TRUE or FALSE after checking the total amount of swap
762 * space avaiable. 763 * space avaiable.
763 * 764 *
764 * FIXME: si_swapinfo(&i) returns all swap devices information. 765 * FIXME: si_swapinfo(&i) returns all swap devices information.
765 * We should only consider resume_device. 766 * We should only consider resume_device.
766 */ 767 */
767 768
768static int enough_swap(void) 769static int enough_swap(void)
@@ -781,18 +782,18 @@ static int swsusp_alloc(void)
781{ 782{
782 int error; 783 int error;
783 784
785 pagedir_nosave = NULL;
786 nr_copy_pages = calc_nr(nr_copy_pages);
787
784 pr_debug("suspend: (pages needed: %d + %d free: %d)\n", 788 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
785 nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); 789 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
786 790
787 pagedir_nosave = NULL;
788 if (!enough_free_mem()) 791 if (!enough_free_mem())
789 return -ENOMEM; 792 return -ENOMEM;
790 793
791 if (!enough_swap()) 794 if (!enough_swap())
792 return -ENOSPC; 795 return -ENOSPC;
793 796
794 nr_copy_pages = calc_nr(nr_copy_pages);
795
796 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { 797 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
797 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 798 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
798 return -ENOMEM; 799 return -ENOMEM;
@@ -827,8 +828,8 @@ static int suspend_prepare_image(void)
827 error = swsusp_alloc(); 828 error = swsusp_alloc();
828 if (error) 829 if (error)
829 return error; 830 return error;
830 831
831 /* During allocating of suspend pagedir, new cold pages may appear. 832 /* During allocating of suspend pagedir, new cold pages may appear.
832 * Kill them. 833 * Kill them.
833 */ 834 */
834 drain_local_pages(); 835 drain_local_pages();
@@ -869,13 +870,6 @@ extern asmlinkage int swsusp_arch_resume(void);
869 870
870asmlinkage int swsusp_save(void) 871asmlinkage int swsusp_save(void)
871{ 872{
872 int error = 0;
873
874 if ((error = swsusp_swap_check())) {
875 printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
876 "swapon -a!\n");
877 return error;
878 }
879 return suspend_prepare_image(); 873 return suspend_prepare_image();
880} 874}
881 875
@@ -892,14 +886,20 @@ int swsusp_suspend(void)
892 * at resume time, and evil weirdness ensues. 886 * at resume time, and evil weirdness ensues.
893 */ 887 */
894 if ((error = device_power_down(PMSG_FREEZE))) { 888 if ((error = device_power_down(PMSG_FREEZE))) {
895 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
896 local_irq_enable(); 889 local_irq_enable();
897 swsusp_free();
898 return error; 890 return error;
899 } 891 }
892
893 if ((error = swsusp_swap_check())) {
894 printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
895 "swapon -a!\n");
896 local_irq_enable();
897 return error;
898 }
899
900 save_processor_state(); 900 save_processor_state();
901 if ((error = swsusp_arch_suspend())) 901 if ((error = swsusp_arch_suspend()))
902 swsusp_free(); 902 printk("Error %d suspending\n", error);
903 /* Restore control flow magically appears here */ 903 /* Restore control flow magically appears here */
904 restore_processor_state(); 904 restore_processor_state();
905 BUG_ON (nr_copy_pages_check != nr_copy_pages); 905 BUG_ON (nr_copy_pages_check != nr_copy_pages);
@@ -929,21 +929,6 @@ int swsusp_resume(void)
929 return error; 929 return error;
930} 930}
931 931
932/* More restore stuff */
933
934/*
935 * Returns true if given address/order collides with any orig_address
936 */
937static int does_collide_order(unsigned long addr, int order)
938{
939 int i;
940
941 for (i=0; i < (1<<order); i++)
942 if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
943 return 1;
944 return 0;
945}
946
947/** 932/**
948 * On resume, for storing the PBE list and the image, 933 * On resume, for storing the PBE list and the image,
949 * we can only use memory pages that do not conflict with the pages 934 * we can only use memory pages that do not conflict with the pages
@@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
973 unsigned long m; 958 unsigned long m;
974 959
975 m = get_zeroed_page(gfp_mask); 960 m = get_zeroed_page(gfp_mask);
976 while (does_collide_order(m, 0)) { 961 while (!PageNosaveFree(virt_to_page(m))) {
977 eat_page((void *)m); 962 eat_page((void *)m);
978 m = get_zeroed_page(gfp_mask); 963 m = get_zeroed_page(gfp_mask);
979 if (!m) 964 if (!m)
@@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1045 1030
1046 /* Set page flags */ 1031 /* Set page flags */
1047 1032
1048 for_each_zone(zone) { 1033 for_each_zone (zone) {
1049 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 1034 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
1050 SetPageNosaveFree(pfn_to_page(zone_pfn + 1035 SetPageNosaveFree(pfn_to_page(zone_pfn +
1051 zone->zone_start_pfn)); 1036 zone->zone_start_pfn));
@@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1061 /* Relocate colliding pages */ 1046 /* Relocate colliding pages */
1062 1047
1063 for_each_pb_page (pbpage, pblist) { 1048 for_each_pb_page (pbpage, pblist) {
1064 if (does_collide_order((unsigned long)pbpage, 0)) { 1049 if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
1065 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); 1050 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
1066 if (!m) { 1051 if (!m) {
1067 error = -ENOMEM; 1052 error = -ENOMEM;
@@ -1181,9 +1166,9 @@ static int bio_write_page(pgoff_t page_off, void * page)
1181static const char * sanity_check(void) 1166static const char * sanity_check(void)
1182{ 1167{
1183 dump_info(); 1168 dump_info();
1184 if(swsusp_info.version_code != LINUX_VERSION_CODE) 1169 if (swsusp_info.version_code != LINUX_VERSION_CODE)
1185 return "kernel version"; 1170 return "kernel version";
1186 if(swsusp_info.num_physpages != num_physpages) 1171 if (swsusp_info.num_physpages != num_physpages)
1187 return "memory size"; 1172 return "memory size";
1188 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) 1173 if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
1189 return "system type"; 1174 return "system type";
@@ -1193,8 +1178,10 @@ static const char * sanity_check(void)
1193 return "version"; 1178 return "version";
1194 if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) 1179 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1195 return "machine"; 1180 return "machine";
1181#if 0
1196 if(swsusp_info.cpus != num_online_cpus()) 1182 if(swsusp_info.cpus != num_online_cpus())
1197 return "number of cpus"; 1183 return "number of cpus";
1184#endif
1198 return NULL; 1185 return NULL;
1199} 1186}
1200 1187
@@ -1274,8 +1261,6 @@ static int data_read(struct pbe *pblist)
1274 return error; 1261 return error;
1275} 1262}
1276 1263
1277extern dev_t name_to_dev_t(const char *line);
1278
1279/** 1264/**
1280 * read_pagedir - Read page backup list pages from swap 1265 * read_pagedir - Read page backup list pages from swap
1281 */ 1266 */
@@ -1369,16 +1354,6 @@ int swsusp_check(void)
1369{ 1354{
1370 int error; 1355 int error;
1371 1356
1372 if (!swsusp_resume_device) {
1373 if (!strlen(resume_file))
1374 return -ENOENT;
1375 swsusp_resume_device = name_to_dev_t(resume_file);
1376 pr_debug("swsusp: Resume From Partition %s\n", resume_file);
1377 } else {
1378 pr_debug("swsusp: Resume From Partition %d:%d\n",
1379 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
1380 }
1381
1382 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 1357 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
1383 if (!IS_ERR(resume_bdev)) { 1358 if (!IS_ERR(resume_bdev)) {
1384 set_blocksize(resume_bdev, PAGE_SIZE); 1359 set_blocksize(resume_bdev, PAGE_SIZE);
diff --git a/kernel/printk.c b/kernel/printk.c
index 01b58d7d17ff..5092397fac29 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
588 log_level_unknown = 1; 588 log_level_unknown = 1;
589 } 589 }
590 590
591 if (!cpu_online(smp_processor_id()) && 591 if (!cpu_online(smp_processor_id())) {
592 system_state != SYSTEM_RUNNING) {
593 /* 592 /*
594 * Some console drivers may assume that per-cpu resources have 593 * Some console drivers may assume that per-cpu resources have
595 * been allocated. So don't allow them to be called by this 594 * been allocated. So don't allow them to be called by this
@@ -876,8 +875,10 @@ void register_console(struct console * console)
876 break; 875 break;
877 console->flags |= CON_ENABLED; 876 console->flags |= CON_ENABLED;
878 console->index = console_cmdline[i].index; 877 console->index = console_cmdline[i].index;
879 if (i == preferred_console) 878 if (i == selected_console) {
880 console->flags |= CON_CONSDEV; 879 console->flags |= CON_CONSDEV;
880 preferred_console = selected_console;
881 }
881 break; 882 break;
882 } 883 }
883 884
@@ -897,6 +898,8 @@ void register_console(struct console * console)
897 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 898 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
898 console->next = console_drivers; 899 console->next = console_drivers;
899 console_drivers = console; 900 console_drivers = console;
901 if (console->next)
902 console->next->flags &= ~CON_CONSDEV;
900 } else { 903 } else {
901 console->next = console_drivers->next; 904 console->next = console_drivers->next;
902 console_drivers->next = console; 905 console_drivers->next = console;
@@ -937,10 +940,14 @@ int unregister_console(struct console * console)
937 /* If last console is removed, we re-enable picking the first 940 /* If last console is removed, we re-enable picking the first
938 * one that gets registered. Without that, pmac early boot console 941 * one that gets registered. Without that, pmac early boot console
939 * would prevent fbcon from taking over. 942 * would prevent fbcon from taking over.
943 *
944 * If this isn't the last console and it has CON_CONSDEV set, we
945 * need to set it on the next preferred console.
940 */ 946 */
941 if (console_drivers == NULL) 947 if (console_drivers == NULL)
942 preferred_console = selected_console; 948 preferred_console = selected_console;
943 949 else if (console->flags & CON_CONSDEV)
950 console_drivers->flags |= CON_CONSDEV;
944 951
945 release_console_sem(); 952 release_console_sem();
946 return res; 953 return res;
diff --git a/kernel/profile.c b/kernel/profile.c
index ad8cbb75ffa2..f89248e6d704 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -35,11 +35,11 @@ struct profile_hit {
35#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) 35#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
36 36
37/* Oprofile timer tick hook */ 37/* Oprofile timer tick hook */
38int (*timer_hook)(struct pt_regs *); 38int (*timer_hook)(struct pt_regs *) __read_mostly;
39 39
40static atomic_t *prof_buffer; 40static atomic_t *prof_buffer;
41static unsigned long prof_len, prof_shift; 41static unsigned long prof_len, prof_shift;
42static int prof_on; 42static int prof_on __read_mostly;
43static cpumask_t prof_cpu_mask = CPU_MASK_ALL; 43static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
44#ifdef CONFIG_SMP 44#ifdef CONFIG_SMP
45static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); 45static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
diff --git a/kernel/resource.c b/kernel/resource.c
index 52f696f11adf..26967e042201 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new,
263 new->start = min; 263 new->start = min;
264 if (new->end > max) 264 if (new->end > max)
265 new->end = max; 265 new->end = max;
266 new->start = (new->start + align - 1) & ~(align - 1); 266 new->start = ALIGN(new->start, align);
267 if (alignf) 267 if (alignf)
268 alignf(alignf_data, new, size, align); 268 alignf(alignf_data, new, size, align);
269 if (new->start < new->end && new->end - new->start >= size - 1) { 269 if (new->start < new->end && new->end - new->start >= size - 1) {
diff --git a/kernel/sched.c b/kernel/sched.c
index deca041fc364..5f889d0cbfcc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
166#define SCALE_PRIO(x, prio) \ 166#define SCALE_PRIO(x, prio) \
167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 167 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
168 168
169static inline unsigned int task_timeslice(task_t *p) 169static unsigned int task_timeslice(task_t *p)
170{ 170{
171 if (p->static_prio < NICE_TO_PRIO(0)) 171 if (p->static_prio < NICE_TO_PRIO(0))
172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 172 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -206,7 +206,7 @@ struct runqueue {
206 */ 206 */
207 unsigned long nr_running; 207 unsigned long nr_running;
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
209 unsigned long cpu_load; 209 unsigned long cpu_load[3];
210#endif 210#endif
211 unsigned long long nr_switches; 211 unsigned long long nr_switches;
212 212
@@ -260,22 +260,86 @@ struct runqueue {
260 260
261static DEFINE_PER_CPU(struct runqueue, runqueues); 261static DEFINE_PER_CPU(struct runqueue, runqueues);
262 262
263/*
264 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
265 * See detach_destroy_domains: synchronize_sched for details.
266 *
267 * The domain tree of any CPU may only be accessed from within
268 * preempt-disabled sections.
269 */
263#define for_each_domain(cpu, domain) \ 270#define for_each_domain(cpu, domain) \
264 for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) 271for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
265 272
266#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 273#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
267#define this_rq() (&__get_cpu_var(runqueues)) 274#define this_rq() (&__get_cpu_var(runqueues))
268#define task_rq(p) cpu_rq(task_cpu(p)) 275#define task_rq(p) cpu_rq(task_cpu(p))
269#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 276#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
270 277
271/*
272 * Default context-switch locking:
273 */
274#ifndef prepare_arch_switch 278#ifndef prepare_arch_switch
275# define prepare_arch_switch(rq, next) do { } while (0) 279# define prepare_arch_switch(next) do { } while (0)
276# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) 280#endif
277# define task_running(rq, p) ((rq)->curr == (p)) 281#ifndef finish_arch_switch
282# define finish_arch_switch(prev) do { } while (0)
283#endif
284
285#ifndef __ARCH_WANT_UNLOCKED_CTXSW
286static inline int task_running(runqueue_t *rq, task_t *p)
287{
288 return rq->curr == p;
289}
290
291static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
292{
293}
294
295static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
296{
297 spin_unlock_irq(&rq->lock);
298}
299
300#else /* __ARCH_WANT_UNLOCKED_CTXSW */
301static inline int task_running(runqueue_t *rq, task_t *p)
302{
303#ifdef CONFIG_SMP
304 return p->oncpu;
305#else
306 return rq->curr == p;
307#endif
308}
309
310static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
311{
312#ifdef CONFIG_SMP
313 /*
314 * We can optimise this out completely for !SMP, because the
315 * SMP rebalancing from interrupt is the only thing that cares
316 * here.
317 */
318 next->oncpu = 1;
278#endif 319#endif
320#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
321 spin_unlock_irq(&rq->lock);
322#else
323 spin_unlock(&rq->lock);
324#endif
325}
326
327static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
328{
329#ifdef CONFIG_SMP
330 /*
331 * After ->oncpu is cleared, the task can be moved to a different CPU.
332 * We must ensure this doesn't happen until the switch is completely
333 * finished.
334 */
335 smp_wmb();
336 prev->oncpu = 0;
337#endif
338#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
339 local_irq_enable();
340#endif
341}
342#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
279 343
280/* 344/*
281 * task_rq_lock - lock the runqueue a given task resides on and disable 345 * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
309 * bump this up when changing the output format or the meaning of an existing 373 * bump this up when changing the output format or the meaning of an existing
310 * format, so that tools can adapt (or abort) 374 * format, so that tools can adapt (or abort)
311 */ 375 */
312#define SCHEDSTAT_VERSION 11 376#define SCHEDSTAT_VERSION 12
313 377
314static int show_schedstat(struct seq_file *seq, void *v) 378static int show_schedstat(struct seq_file *seq, void *v)
315{ 379{
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
338 402
339#ifdef CONFIG_SMP 403#ifdef CONFIG_SMP
340 /* domain-specific stats */ 404 /* domain-specific stats */
405 preempt_disable();
341 for_each_domain(cpu, sd) { 406 for_each_domain(cpu, sd) {
342 enum idle_type itype; 407 enum idle_type itype;
343 char mask_str[NR_CPUS]; 408 char mask_str[NR_CPUS];
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
356 sd->lb_nobusyq[itype], 421 sd->lb_nobusyq[itype],
357 sd->lb_nobusyg[itype]); 422 sd->lb_nobusyg[itype]);
358 } 423 }
359 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", 424 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
360 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 425 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
361 sd->sbe_pushed, sd->sbe_attempts, 426 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
427 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
362 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); 428 sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
363 } 429 }
430 preempt_enable();
364#endif 431#endif
365 } 432 }
366 return 0; 433 return 0;
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)
414 return rq; 481 return rq;
415} 482}
416 483
417#ifdef CONFIG_SCHED_SMT
418static int cpu_and_siblings_are_idle(int cpu)
419{
420 int sib;
421 for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
422 if (idle_cpu(sib))
423 continue;
424 return 0;
425 }
426
427 return 1;
428}
429#else
430#define cpu_and_siblings_are_idle(A) idle_cpu(A)
431#endif
432
433#ifdef CONFIG_SCHEDSTATS 484#ifdef CONFIG_SCHEDSTATS
434/* 485/*
435 * Called when a process is dequeued from the active array and given 486 * Called when a process is dequeued from the active array and given
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
622 rq->nr_running++; 673 rq->nr_running++;
623} 674}
624 675
625static void recalc_task_prio(task_t *p, unsigned long long now) 676static int recalc_task_prio(task_t *p, unsigned long long now)
626{ 677{
627 /* Caller must always ensure 'now >= p->timestamp' */ 678 /* Caller must always ensure 'now >= p->timestamp' */
628 unsigned long long __sleep_time = now - p->timestamp; 679 unsigned long long __sleep_time = now - p->timestamp;
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
681 } 732 }
682 } 733 }
683 734
684 p->prio = effective_prio(p); 735 return effective_prio(p);
685} 736}
686 737
687/* 738/*
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
704 } 755 }
705#endif 756#endif
706 757
707 recalc_task_prio(p, now); 758 p->prio = recalc_task_prio(p, now);
708 759
709 /* 760 /*
710 * This checks to make sure it's not an uninterruptible task 761 * This checks to make sure it's not an uninterruptible task
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)
782} 833}
783 834
784#ifdef CONFIG_SMP 835#ifdef CONFIG_SMP
785enum request_type {
786 REQ_MOVE_TASK,
787 REQ_SET_DOMAIN,
788};
789
790typedef struct { 836typedef struct {
791 struct list_head list; 837 struct list_head list;
792 enum request_type type;
793 838
794 /* For REQ_MOVE_TASK */
795 task_t *task; 839 task_t *task;
796 int dest_cpu; 840 int dest_cpu;
797 841
798 /* For REQ_SET_DOMAIN */
799 struct sched_domain *sd;
800
801 struct completion done; 842 struct completion done;
802} migration_req_t; 843} migration_req_t;
803 844
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
819 } 860 }
820 861
821 init_completion(&req->done); 862 init_completion(&req->done);
822 req->type = REQ_MOVE_TASK;
823 req->task = p; 863 req->task = p;
824 req->dest_cpu = dest_cpu; 864 req->dest_cpu = dest_cpu;
825 list_add(&req->list, &rq->migration_queue); 865 list_add(&req->list, &rq->migration_queue);
@@ -886,26 +926,154 @@ void kick_process(task_t *p)
886 * We want to under-estimate the load of migration sources, to 926 * We want to under-estimate the load of migration sources, to
887 * balance conservatively. 927 * balance conservatively.
888 */ 928 */
889static inline unsigned long source_load(int cpu) 929static inline unsigned long source_load(int cpu, int type)
890{ 930{
891 runqueue_t *rq = cpu_rq(cpu); 931 runqueue_t *rq = cpu_rq(cpu);
892 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 932 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
933 if (type == 0)
934 return load_now;
893 935
894 return min(rq->cpu_load, load_now); 936 return min(rq->cpu_load[type-1], load_now);
895} 937}
896 938
897/* 939/*
898 * Return a high guess at the load of a migration-target cpu 940 * Return a high guess at the load of a migration-target cpu
899 */ 941 */
900static inline unsigned long target_load(int cpu) 942static inline unsigned long target_load(int cpu, int type)
901{ 943{
902 runqueue_t *rq = cpu_rq(cpu); 944 runqueue_t *rq = cpu_rq(cpu);
903 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 945 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
946 if (type == 0)
947 return load_now;
904 948
905 return max(rq->cpu_load, load_now); 949 return max(rq->cpu_load[type-1], load_now);
906} 950}
907 951
908#endif 952/*
953 * find_idlest_group finds and returns the least busy CPU group within the
954 * domain.
955 */
956static struct sched_group *
957find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
958{
959 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
960 unsigned long min_load = ULONG_MAX, this_load = 0;
961 int load_idx = sd->forkexec_idx;
962 int imbalance = 100 + (sd->imbalance_pct-100)/2;
963
964 do {
965 unsigned long load, avg_load;
966 int local_group;
967 int i;
968
969 local_group = cpu_isset(this_cpu, group->cpumask);
970 /* XXX: put a cpus allowed check */
971
972 /* Tally up the load of all CPUs in the group */
973 avg_load = 0;
974
975 for_each_cpu_mask(i, group->cpumask) {
976 /* Bias balancing toward cpus of our domain */
977 if (local_group)
978 load = source_load(i, load_idx);
979 else
980 load = target_load(i, load_idx);
981
982 avg_load += load;
983 }
984
985 /* Adjust by relative CPU power of the group */
986 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
987
988 if (local_group) {
989 this_load = avg_load;
990 this = group;
991 } else if (avg_load < min_load) {
992 min_load = avg_load;
993 idlest = group;
994 }
995 group = group->next;
996 } while (group != sd->groups);
997
998 if (!idlest || 100*this_load < imbalance*min_load)
999 return NULL;
1000 return idlest;
1001}
1002
1003/*
1004 * find_idlest_queue - find the idlest runqueue among the cpus in group.
1005 */
1006static int find_idlest_cpu(struct sched_group *group, int this_cpu)
1007{
1008 unsigned long load, min_load = ULONG_MAX;
1009 int idlest = -1;
1010 int i;
1011
1012 for_each_cpu_mask(i, group->cpumask) {
1013 load = source_load(i, 0);
1014
1015 if (load < min_load || (load == min_load && i == this_cpu)) {
1016 min_load = load;
1017 idlest = i;
1018 }
1019 }
1020
1021 return idlest;
1022}
1023
1024/*
1025 * sched_balance_self: balance the current task (running on cpu) in domains
1026 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1027 * SD_BALANCE_EXEC.
1028 *
1029 * Balance, ie. select the least loaded group.
1030 *
1031 * Returns the target CPU number, or the same CPU if no balancing is needed.
1032 *
1033 * preempt must be disabled.
1034 */
1035static int sched_balance_self(int cpu, int flag)
1036{
1037 struct task_struct *t = current;
1038 struct sched_domain *tmp, *sd = NULL;
1039
1040 for_each_domain(cpu, tmp)
1041 if (tmp->flags & flag)
1042 sd = tmp;
1043
1044 while (sd) {
1045 cpumask_t span;
1046 struct sched_group *group;
1047 int new_cpu;
1048 int weight;
1049
1050 span = sd->span;
1051 group = find_idlest_group(sd, t, cpu);
1052 if (!group)
1053 goto nextlevel;
1054
1055 new_cpu = find_idlest_cpu(group, cpu);
1056 if (new_cpu == -1 || new_cpu == cpu)
1057 goto nextlevel;
1058
1059 /* Now try balancing at a lower domain level */
1060 cpu = new_cpu;
1061nextlevel:
1062 sd = NULL;
1063 weight = cpus_weight(span);
1064 for_each_domain(cpu, tmp) {
1065 if (weight <= cpus_weight(tmp->span))
1066 break;
1067 if (tmp->flags & flag)
1068 sd = tmp;
1069 }
1070 /* while loop will break here if sd == NULL */
1071 }
1072
1073 return cpu;
1074}
1075
1076#endif /* CONFIG_SMP */
909 1077
910/* 1078/*
911 * wake_idle() will wake a task on an idle cpu if task->cpu is 1079 * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)
927 1095
928 for_each_domain(cpu, sd) { 1096 for_each_domain(cpu, sd) {
929 if (sd->flags & SD_WAKE_IDLE) { 1097 if (sd->flags & SD_WAKE_IDLE) {
930 cpus_and(tmp, sd->span, cpu_online_map); 1098 cpus_and(tmp, sd->span, p->cpus_allowed);
931 cpus_and(tmp, tmp, p->cpus_allowed);
932 for_each_cpu_mask(i, tmp) { 1099 for_each_cpu_mask(i, tmp) {
933 if (idle_cpu(i)) 1100 if (idle_cpu(i))
934 return i; 1101 return i;
935 } 1102 }
936 } 1103 }
937 else break; 1104 else
1105 break;
938 } 1106 }
939 return cpu; 1107 return cpu;
940} 1108}
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
967 runqueue_t *rq; 1135 runqueue_t *rq;
968#ifdef CONFIG_SMP 1136#ifdef CONFIG_SMP
969 unsigned long load, this_load; 1137 unsigned long load, this_load;
970 struct sched_domain *sd; 1138 struct sched_domain *sd, *this_sd = NULL;
971 int new_cpu; 1139 int new_cpu;
972#endif 1140#endif
973 1141
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
986 if (unlikely(task_running(rq, p))) 1154 if (unlikely(task_running(rq, p)))
987 goto out_activate; 1155 goto out_activate;
988 1156
989#ifdef CONFIG_SCHEDSTATS 1157 new_cpu = cpu;
1158
990 schedstat_inc(rq, ttwu_cnt); 1159 schedstat_inc(rq, ttwu_cnt);
991 if (cpu == this_cpu) { 1160 if (cpu == this_cpu) {
992 schedstat_inc(rq, ttwu_local); 1161 schedstat_inc(rq, ttwu_local);
993 } else { 1162 goto out_set_cpu;
994 for_each_domain(this_cpu, sd) { 1163 }
995 if (cpu_isset(cpu, sd->span)) { 1164
996 schedstat_inc(sd, ttwu_wake_remote); 1165 for_each_domain(this_cpu, sd) {
997 break; 1166 if (cpu_isset(cpu, sd->span)) {
998 } 1167 schedstat_inc(sd, ttwu_wake_remote);
1168 this_sd = sd;
1169 break;
999 } 1170 }
1000 } 1171 }
1001#endif
1002 1172
1003 new_cpu = cpu; 1173 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1004 if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu; 1174 goto out_set_cpu;
1006 1175
1007 load = source_load(cpu);
1008 this_load = target_load(this_cpu);
1009
1010 /* 1176 /*
1011 * If sync wakeup then subtract the (maximum possible) effect of 1177 * Check for affine wakeup and passive balancing possibilities.
1012 * the currently running task from the load of the current CPU:
1013 */ 1178 */
1014 if (sync) 1179 if (this_sd) {
1015 this_load -= SCHED_LOAD_SCALE; 1180 int idx = this_sd->wake_idx;
1181 unsigned int imbalance;
1016 1182
1017 /* Don't pull the task off an idle CPU to a busy one */ 1183 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1018 if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
1019 goto out_set_cpu;
1020 1184
1021 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1185 load = source_load(cpu, idx);
1186 this_load = target_load(this_cpu, idx);
1022 1187
1023 /* 1188 new_cpu = this_cpu; /* Wake to this CPU if we can */
1024 * Scan domains for affine wakeup and passive balancing
1025 * possibilities.
1026 */
1027 for_each_domain(this_cpu, sd) {
1028 unsigned int imbalance;
1029 /*
1030 * Start passive balancing when half the imbalance_pct
1031 * limit is reached.
1032 */
1033 imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
1034 1189
1035 if ((sd->flags & SD_WAKE_AFFINE) && 1190 if (this_sd->flags & SD_WAKE_AFFINE) {
1036 !task_hot(p, rq->timestamp_last_tick, sd)) { 1191 unsigned long tl = this_load;
1037 /* 1192 /*
1038 * This domain has SD_WAKE_AFFINE and p is cache cold 1193 * If sync wakeup then subtract the (maximum possible)
1039 * in this domain. 1194 * effect of the currently running task from the load
1195 * of the current CPU:
1040 */ 1196 */
1041 if (cpu_isset(cpu, sd->span)) { 1197 if (sync)
1042 schedstat_inc(sd, ttwu_move_affine); 1198 tl -= SCHED_LOAD_SCALE;
1199
1200 if ((tl <= load &&
1201 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
1202 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
1203 /*
1204 * This domain has SD_WAKE_AFFINE and
1205 * p is cache cold in this domain, and
1206 * there is no bad imbalance.
1207 */
1208 schedstat_inc(this_sd, ttwu_move_affine);
1043 goto out_set_cpu; 1209 goto out_set_cpu;
1044 } 1210 }
1045 } else if ((sd->flags & SD_WAKE_BALANCE) && 1211 }
1046 imbalance*this_load <= 100*load) { 1212
1047 /* 1213 /*
1048 * This domain has SD_WAKE_BALANCE and there is 1214 * Start passive balancing when half the imbalance_pct
1049 * an imbalance. 1215 * limit is reached.
1050 */ 1216 */
1051 if (cpu_isset(cpu, sd->span)) { 1217 if (this_sd->flags & SD_WAKE_BALANCE) {
1052 schedstat_inc(sd, ttwu_move_balance); 1218 if (imbalance*this_load <= 100*load) {
1219 schedstat_inc(this_sd, ttwu_move_balance);
1053 goto out_set_cpu; 1220 goto out_set_cpu;
1054 } 1221 }
1055 } 1222 }
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
1120 return try_to_wake_up(p, state, 0); 1287 return try_to_wake_up(p, state, 0);
1121} 1288}
1122 1289
1123#ifdef CONFIG_SMP
1124static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1125 struct sched_domain *sd);
1126#endif
1127
1128/* 1290/*
1129 * Perform scheduler related setup for a newly forked process p. 1291 * Perform scheduler related setup for a newly forked process p.
1130 * p is forked by current. 1292 * p is forked by current.
1131 */ 1293 */
1132void fastcall sched_fork(task_t *p) 1294void fastcall sched_fork(task_t *p, int clone_flags)
1133{ 1295{
1296 int cpu = get_cpu();
1297
1298#ifdef CONFIG_SMP
1299 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1300#endif
1301 set_task_cpu(p, cpu);
1302
1134 /* 1303 /*
1135 * We mark the process as running here, but have not actually 1304 * We mark the process as running here, but have not actually
1136 * inserted it onto the runqueue yet. This guarantees that 1305 * inserted it onto the runqueue yet. This guarantees that
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
1140 p->state = TASK_RUNNING; 1309 p->state = TASK_RUNNING;
1141 INIT_LIST_HEAD(&p->run_list); 1310 INIT_LIST_HEAD(&p->run_list);
1142 p->array = NULL; 1311 p->array = NULL;
1143 spin_lock_init(&p->switch_lock);
1144#ifdef CONFIG_SCHEDSTATS 1312#ifdef CONFIG_SCHEDSTATS
1145 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1313 memset(&p->sched_info, 0, sizeof(p->sched_info));
1146#endif 1314#endif
1315#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1316 p->oncpu = 0;
1317#endif
1147#ifdef CONFIG_PREEMPT 1318#ifdef CONFIG_PREEMPT
1148 /* 1319 /* Want to start with kernel preemption disabled. */
1149 * During context-switch we hold precisely one spinlock, which
1150 * schedule_tail drops. (in the common case it's this_rq()->lock,
1151 * but it also can be p->switch_lock.) So we compensate with a count
1152 * of 1. Also, we want to start with kernel preemption disabled.
1153 */
1154 p->thread_info->preempt_count = 1; 1320 p->thread_info->preempt_count = 1;
1155#endif 1321#endif
1156 /* 1322 /*
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
1174 * runqueue lock is not a problem. 1340 * runqueue lock is not a problem.
1175 */ 1341 */
1176 current->time_slice = 1; 1342 current->time_slice = 1;
1177 preempt_disable();
1178 scheduler_tick(); 1343 scheduler_tick();
1179 local_irq_enable(); 1344 }
1180 preempt_enable(); 1345 local_irq_enable();
1181 } else 1346 put_cpu();
1182 local_irq_enable();
1183} 1347}
1184 1348
1185/* 1349/*
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1196 runqueue_t *rq, *this_rq; 1360 runqueue_t *rq, *this_rq;
1197 1361
1198 rq = task_rq_lock(p, &flags); 1362 rq = task_rq_lock(p, &flags);
1199 cpu = task_cpu(p);
1200 this_cpu = smp_processor_id();
1201
1202 BUG_ON(p->state != TASK_RUNNING); 1363 BUG_ON(p->state != TASK_RUNNING);
1364 this_cpu = smp_processor_id();
1365 cpu = task_cpu(p);
1203 1366
1204 /* 1367 /*
1205 * We decrease the sleep average of forking parents 1368 * We decrease the sleep average of forking parents
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)
1296} 1459}
1297 1460
1298/** 1461/**
1462 * prepare_task_switch - prepare to switch tasks
1463 * @rq: the runqueue preparing to switch
1464 * @next: the task we are going to switch to.
1465 *
1466 * This is called with the rq lock held and interrupts off. It must
1467 * be paired with a subsequent finish_task_switch after the context
1468 * switch.
1469 *
1470 * prepare_task_switch sets up locking and calls architecture specific
1471 * hooks.
1472 */
1473static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1474{
1475 prepare_lock_switch(rq, next);
1476 prepare_arch_switch(next);
1477}
1478
1479/**
1299 * finish_task_switch - clean up after a task-switch 1480 * finish_task_switch - clean up after a task-switch
1300 * @prev: the thread we just switched away from. 1481 * @prev: the thread we just switched away from.
1301 * 1482 *
1302 * We enter this with the runqueue still locked, and finish_arch_switch() 1483 * finish_task_switch must be called after the context switch, paired
1303 * will unlock it along with doing any other architecture-specific cleanup 1484 * with a prepare_task_switch call before the context switch.
1304 * actions. 1485 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1486 * and do any other architecture-specific cleanup actions.
1305 * 1487 *
1306 * Note that we may have delayed dropping an mm in context_switch(). If 1488 * Note that we may have delayed dropping an mm in context_switch(). If
1307 * so, we finish that here outside of the runqueue lock. (Doing it 1489 * so, we finish that here outside of the runqueue lock. (Doing it
1308 * with the lock held can cause deadlocks; see schedule() for 1490 * with the lock held can cause deadlocks; see schedule() for
1309 * details.) 1491 * details.)
1310 */ 1492 */
1311static inline void finish_task_switch(task_t *prev) 1493static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1312 __releases(rq->lock) 1494 __releases(rq->lock)
1313{ 1495{
1314 runqueue_t *rq = this_rq();
1315 struct mm_struct *mm = rq->prev_mm; 1496 struct mm_struct *mm = rq->prev_mm;
1316 unsigned long prev_task_flags; 1497 unsigned long prev_task_flags;
1317 1498
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
1329 * Manfred Spraul <manfred@colorfullife.com> 1510 * Manfred Spraul <manfred@colorfullife.com>
1330 */ 1511 */
1331 prev_task_flags = prev->flags; 1512 prev_task_flags = prev->flags;
1332 finish_arch_switch(rq, prev); 1513 finish_arch_switch(prev);
1514 finish_lock_switch(rq, prev);
1333 if (mm) 1515 if (mm)
1334 mmdrop(mm); 1516 mmdrop(mm);
1335 if (unlikely(prev_task_flags & PF_DEAD)) 1517 if (unlikely(prev_task_flags & PF_DEAD))
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
1343asmlinkage void schedule_tail(task_t *prev) 1525asmlinkage void schedule_tail(task_t *prev)
1344 __releases(rq->lock) 1526 __releases(rq->lock)
1345{ 1527{
1346 finish_task_switch(prev); 1528 runqueue_t *rq = this_rq();
1347 1529 finish_task_switch(rq, prev);
1530#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1531 /* In this case, finish_task_switch does not reenable preemption */
1532 preempt_enable();
1533#endif
1348 if (current->set_child_tid) 1534 if (current->set_child_tid)
1349 put_user(current->pid, current->set_child_tid); 1535 put_user(current->pid, current->set_child_tid);
1350} 1536}
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1494} 1680}
1495 1681
1496/* 1682/*
1497 * find_idlest_cpu - find the least busy runqueue.
1498 */
1499static int find_idlest_cpu(struct task_struct *p, int this_cpu,
1500 struct sched_domain *sd)
1501{
1502 unsigned long load, min_load, this_load;
1503 int i, min_cpu;
1504 cpumask_t mask;
1505
1506 min_cpu = UINT_MAX;
1507 min_load = ULONG_MAX;
1508
1509 cpus_and(mask, sd->span, p->cpus_allowed);
1510
1511 for_each_cpu_mask(i, mask) {
1512 load = target_load(i);
1513
1514 if (load < min_load) {
1515 min_cpu = i;
1516 min_load = load;
1517
1518 /* break out early on an idle CPU: */
1519 if (!min_load)
1520 break;
1521 }
1522 }
1523
1524 /* add +1 to account for the new task */
1525 this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
1526
1527 /*
1528 * Would with the addition of the new task to the
1529 * current CPU there be an imbalance between this
1530 * CPU and the idlest CPU?
1531 *
1532 * Use half of the balancing threshold - new-context is
1533 * a good opportunity to balance.
1534 */
1535 if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
1536 return min_cpu;
1537
1538 return this_cpu;
1539}
1540
1541/*
1542 * If dest_cpu is allowed for this process, migrate the task to it. 1683 * If dest_cpu is allowed for this process, migrate the task to it.
1543 * This is accomplished by forcing the cpu_allowed mask to only 1684 * This is accomplished by forcing the cpu_allowed mask to only
1544 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1685 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1571,37 +1712,16 @@ out:
1571} 1712}
1572 1713
1573/* 1714/*
1574 * sched_exec(): find the highest-level, exec-balance-capable 1715 * sched_exec - execve() is a valuable balancing opportunity, because at
1575 * domain and try to migrate the task to the least loaded CPU. 1716 * this point the task has the smallest effective memory and cache footprint.
1576 *
1577 * execve() is a valuable balancing opportunity, because at this point
1578 * the task has the smallest effective memory and cache footprint.
1579 */ 1717 */
1580void sched_exec(void) 1718void sched_exec(void)
1581{ 1719{
1582 struct sched_domain *tmp, *sd = NULL;
1583 int new_cpu, this_cpu = get_cpu(); 1720 int new_cpu, this_cpu = get_cpu();
1584 1721 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
1585 /* Prefer the current CPU if there's only this task running */
1586 if (this_rq()->nr_running <= 1)
1587 goto out;
1588
1589 for_each_domain(this_cpu, tmp)
1590 if (tmp->flags & SD_BALANCE_EXEC)
1591 sd = tmp;
1592
1593 if (sd) {
1594 schedstat_inc(sd, sbe_attempts);
1595 new_cpu = find_idlest_cpu(current, this_cpu, sd);
1596 if (new_cpu != this_cpu) {
1597 schedstat_inc(sd, sbe_pushed);
1598 put_cpu();
1599 sched_migrate_task(current, new_cpu);
1600 return;
1601 }
1602 }
1603out:
1604 put_cpu(); 1722 put_cpu();
1723 if (new_cpu != this_cpu)
1724 sched_migrate_task(current, new_cpu);
1605} 1725}
1606 1726
1607/* 1727/*
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1632 */ 1752 */
1633static inline 1753static inline
1634int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1754int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1635 struct sched_domain *sd, enum idle_type idle) 1755 struct sched_domain *sd, enum idle_type idle, int *all_pinned)
1636{ 1756{
1637 /* 1757 /*
1638 * We do not migrate tasks that are: 1758 * We do not migrate tasks that are:
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1640 * 2) cannot be migrated to this CPU due to cpus_allowed, or 1760 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1641 * 3) are cache-hot on their current CPU. 1761 * 3) are cache-hot on their current CPU.
1642 */ 1762 */
1643 if (task_running(rq, p))
1644 return 0;
1645 if (!cpu_isset(this_cpu, p->cpus_allowed)) 1763 if (!cpu_isset(this_cpu, p->cpus_allowed))
1646 return 0; 1764 return 0;
1765 *all_pinned = 0;
1766
1767 if (task_running(rq, p))
1768 return 0;
1647 1769
1648 /* 1770 /*
1649 * Aggressive migration if: 1771 * Aggressive migration if:
1650 * 1) the [whole] cpu is idle, or 1772 * 1) task is cache cold, or
1651 * 2) too many balance attempts have failed. 1773 * 2) too many balance attempts have failed.
1652 */ 1774 */
1653 1775
1654 if (cpu_and_siblings_are_idle(this_cpu) || \ 1776 if (sd->nr_balance_failed > sd->cache_nice_tries)
1655 sd->nr_balance_failed > sd->cache_nice_tries)
1656 return 1; 1777 return 1;
1657 1778
1658 if (task_hot(p, rq->timestamp_last_tick, sd)) 1779 if (task_hot(p, rq->timestamp_last_tick, sd))
1659 return 0; 1780 return 0;
1660 return 1; 1781 return 1;
1661} 1782}
1662 1783
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1669 */ 1790 */
1670static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 1791static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1671 unsigned long max_nr_move, struct sched_domain *sd, 1792 unsigned long max_nr_move, struct sched_domain *sd,
1672 enum idle_type idle) 1793 enum idle_type idle, int *all_pinned)
1673{ 1794{
1674 prio_array_t *array, *dst_array; 1795 prio_array_t *array, *dst_array;
1675 struct list_head *head, *curr; 1796 struct list_head *head, *curr;
1676 int idx, pulled = 0; 1797 int idx, pulled = 0, pinned = 0;
1677 task_t *tmp; 1798 task_t *tmp;
1678 1799
1679 if (max_nr_move <= 0 || busiest->nr_running <= 1) 1800 if (max_nr_move == 0)
1680 goto out; 1801 goto out;
1681 1802
1803 pinned = 1;
1804
1682 /* 1805 /*
1683 * We first consider expired tasks. Those will likely not be 1806 * We first consider expired tasks. Those will likely not be
1684 * executed in the near future, and they are most likely to 1807 * executed in the near future, and they are most likely to
@@ -1717,7 +1840,7 @@ skip_queue:
1717 1840
1718 curr = curr->prev; 1841 curr = curr->prev;
1719 1842
1720 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { 1843 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
1721 if (curr != head) 1844 if (curr != head)
1722 goto skip_queue; 1845 goto skip_queue;
1723 idx++; 1846 idx++;
@@ -1746,6 +1869,9 @@ out:
1746 * inside pull_task(). 1869 * inside pull_task().
1747 */ 1870 */
1748 schedstat_add(sd, lb_gained[idle], pulled); 1871 schedstat_add(sd, lb_gained[idle], pulled);
1872
1873 if (all_pinned)
1874 *all_pinned = pinned;
1749 return pulled; 1875 return pulled;
1750} 1876}
1751 1877
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1760{ 1886{
1761 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1887 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1762 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1888 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1889 int load_idx;
1763 1890
1764 max_load = this_load = total_load = total_pwr = 0; 1891 max_load = this_load = total_load = total_pwr = 0;
1892 if (idle == NOT_IDLE)
1893 load_idx = sd->busy_idx;
1894 else if (idle == NEWLY_IDLE)
1895 load_idx = sd->newidle_idx;
1896 else
1897 load_idx = sd->idle_idx;
1765 1898
1766 do { 1899 do {
1767 unsigned long load; 1900 unsigned long load;
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1776 for_each_cpu_mask(i, group->cpumask) { 1909 for_each_cpu_mask(i, group->cpumask) {
1777 /* Bias balancing toward cpus of our domain */ 1910 /* Bias balancing toward cpus of our domain */
1778 if (local_group) 1911 if (local_group)
1779 load = target_load(i); 1912 load = target_load(i, load_idx);
1780 else 1913 else
1781 load = source_load(i); 1914 load = source_load(i, load_idx);
1782 1915
1783 avg_load += load; 1916 avg_load += load;
1784 } 1917 }
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1792 if (local_group) { 1925 if (local_group) {
1793 this_load = avg_load; 1926 this_load = avg_load;
1794 this = group; 1927 this = group;
1795 goto nextgroup;
1796 } else if (avg_load > max_load) { 1928 } else if (avg_load > max_load) {
1797 max_load = avg_load; 1929 max_load = avg_load;
1798 busiest = group; 1930 busiest = group;
1799 } 1931 }
1800nextgroup:
1801 group = group->next; 1932 group = group->next;
1802 } while (group != sd->groups); 1933 } while (group != sd->groups);
1803 1934
@@ -1870,15 +2001,9 @@ nextgroup:
1870 2001
1871 /* Get rid of the scaling factor, rounding down as we divide */ 2002 /* Get rid of the scaling factor, rounding down as we divide */
1872 *imbalance = *imbalance / SCHED_LOAD_SCALE; 2003 *imbalance = *imbalance / SCHED_LOAD_SCALE;
1873
1874 return busiest; 2004 return busiest;
1875 2005
1876out_balanced: 2006out_balanced:
1877 if (busiest && (idle == NEWLY_IDLE ||
1878 (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
1879 *imbalance = 1;
1880 return busiest;
1881 }
1882 2007
1883 *imbalance = 0; 2008 *imbalance = 0;
1884 return NULL; 2009 return NULL;
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
1894 int i; 2019 int i;
1895 2020
1896 for_each_cpu_mask(i, group->cpumask) { 2021 for_each_cpu_mask(i, group->cpumask) {
1897 load = source_load(i); 2022 load = source_load(i, 0);
1898 2023
1899 if (load > max_load) { 2024 if (load > max_load) {
1900 max_load = load; 2025 max_load = load;
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
1906} 2031}
1907 2032
1908/* 2033/*
2034 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2035 * so long as it is large enough.
2036 */
2037#define MAX_PINNED_INTERVAL 512
2038
2039/*
1909 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2040 * Check this_cpu to ensure it is balanced within domain. Attempt to move
1910 * tasks if there is an imbalance. 2041 * tasks if there is an imbalance.
1911 * 2042 *
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1917 struct sched_group *group; 2048 struct sched_group *group;
1918 runqueue_t *busiest; 2049 runqueue_t *busiest;
1919 unsigned long imbalance; 2050 unsigned long imbalance;
1920 int nr_moved; 2051 int nr_moved, all_pinned = 0;
2052 int active_balance = 0;
1921 2053
1922 spin_lock(&this_rq->lock); 2054 spin_lock(&this_rq->lock);
1923 schedstat_inc(sd, lb_cnt[idle]); 2055 schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1934 goto out_balanced; 2066 goto out_balanced;
1935 } 2067 }
1936 2068
1937 /* 2069 BUG_ON(busiest == this_rq);
1938 * This should be "impossible", but since load
1939 * balancing is inherently racy and statistical,
1940 * it could happen in theory.
1941 */
1942 if (unlikely(busiest == this_rq)) {
1943 WARN_ON(1);
1944 goto out_balanced;
1945 }
1946 2070
1947 schedstat_add(sd, lb_imbalance[idle], imbalance); 2071 schedstat_add(sd, lb_imbalance[idle], imbalance);
1948 2072
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1956 */ 2080 */
1957 double_lock_balance(this_rq, busiest); 2081 double_lock_balance(this_rq, busiest);
1958 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2082 nr_moved = move_tasks(this_rq, this_cpu, busiest,
1959 imbalance, sd, idle); 2083 imbalance, sd, idle,
2084 &all_pinned);
1960 spin_unlock(&busiest->lock); 2085 spin_unlock(&busiest->lock);
2086
2087 /* All tasks on this runqueue were pinned by CPU affinity */
2088 if (unlikely(all_pinned))
2089 goto out_balanced;
1961 } 2090 }
2091
1962 spin_unlock(&this_rq->lock); 2092 spin_unlock(&this_rq->lock);
1963 2093
1964 if (!nr_moved) { 2094 if (!nr_moved) {
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
1966 sd->nr_balance_failed++; 2096 sd->nr_balance_failed++;
1967 2097
1968 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2098 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
1969 int wake = 0;
1970 2099
1971 spin_lock(&busiest->lock); 2100 spin_lock(&busiest->lock);
1972 if (!busiest->active_balance) { 2101 if (!busiest->active_balance) {
1973 busiest->active_balance = 1; 2102 busiest->active_balance = 1;
1974 busiest->push_cpu = this_cpu; 2103 busiest->push_cpu = this_cpu;
1975 wake = 1; 2104 active_balance = 1;
1976 } 2105 }
1977 spin_unlock(&busiest->lock); 2106 spin_unlock(&busiest->lock);
1978 if (wake) 2107 if (active_balance)
1979 wake_up_process(busiest->migration_thread); 2108 wake_up_process(busiest->migration_thread);
1980 2109
1981 /* 2110 /*
1982 * We've kicked active balancing, reset the failure 2111 * We've kicked active balancing, reset the failure
1983 * counter. 2112 * counter.
1984 */ 2113 */
1985 sd->nr_balance_failed = sd->cache_nice_tries; 2114 sd->nr_balance_failed = sd->cache_nice_tries+1;
1986 } 2115 }
1987 2116 } else
1988 /*
1989 * We were unbalanced, but unsuccessful in move_tasks(),
1990 * so bump the balance_interval to lessen the lock contention.
1991 */
1992 if (sd->balance_interval < sd->max_interval)
1993 sd->balance_interval++;
1994 } else {
1995 sd->nr_balance_failed = 0; 2117 sd->nr_balance_failed = 0;
1996 2118
2119 if (likely(!active_balance)) {
1997 /* We were unbalanced, so reset the balancing interval */ 2120 /* We were unbalanced, so reset the balancing interval */
1998 sd->balance_interval = sd->min_interval; 2121 sd->balance_interval = sd->min_interval;
2122 } else {
2123 /*
2124 * If we've begun active balancing, start to back off. This
2125 * case may not be covered by the all_pinned logic if there
2126 * is only 1 task on the busy runqueue (because we don't call
2127 * move_tasks).
2128 */
2129 if (sd->balance_interval < sd->max_interval)
2130 sd->balance_interval *= 2;
1999 } 2131 }
2000 2132
2001 return nr_moved; 2133 return nr_moved;
@@ -2005,8 +2137,10 @@ out_balanced:
2005 2137
2006 schedstat_inc(sd, lb_balanced[idle]); 2138 schedstat_inc(sd, lb_balanced[idle]);
2007 2139
2140 sd->nr_balance_failed = 0;
2008 /* tune up the balancing interval */ 2141 /* tune up the balancing interval */
2009 if (sd->balance_interval < sd->max_interval) 2142 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2143 (sd->balance_interval < sd->max_interval))
2010 sd->balance_interval *= 2; 2144 sd->balance_interval *= 2;
2011 2145
2012 return 0; 2146 return 0;
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2030 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2164 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2031 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2165 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
2032 if (!group) { 2166 if (!group) {
2033 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2034 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2167 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2035 goto out; 2168 goto out_balanced;
2036 } 2169 }
2037 2170
2038 busiest = find_busiest_queue(group); 2171 busiest = find_busiest_queue(group);
2039 if (!busiest || busiest == this_rq) { 2172 if (!busiest) {
2040 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2041 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2173 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2042 goto out; 2174 goto out_balanced;
2043 } 2175 }
2044 2176
2177 BUG_ON(busiest == this_rq);
2178
2045 /* Attempt to move tasks */ 2179 /* Attempt to move tasks */
2046 double_lock_balance(this_rq, busiest); 2180 double_lock_balance(this_rq, busiest);
2047 2181
2048 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2182 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2049 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2183 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2050 imbalance, sd, NEWLY_IDLE); 2184 imbalance, sd, NEWLY_IDLE, NULL);
2051 if (!nr_moved) 2185 if (!nr_moved)
2052 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2186 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2187 else
2188 sd->nr_balance_failed = 0;
2053 2189
2054 spin_unlock(&busiest->lock); 2190 spin_unlock(&busiest->lock);
2055
2056out:
2057 return nr_moved; 2191 return nr_moved;
2192
2193out_balanced:
2194 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2195 sd->nr_balance_failed = 0;
2196 return 0;
2058} 2197}
2059 2198
2060/* 2199/*
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
2086static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2225static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2087{ 2226{
2088 struct sched_domain *sd; 2227 struct sched_domain *sd;
2089 struct sched_group *cpu_group;
2090 runqueue_t *target_rq; 2228 runqueue_t *target_rq;
2091 cpumask_t visited_cpus; 2229 int target_cpu = busiest_rq->push_cpu;
2092 int cpu; 2230
2231 if (busiest_rq->nr_running <= 1)
2232 /* no task to move */
2233 return;
2234
2235 target_rq = cpu_rq(target_cpu);
2093 2236
2094 /* 2237 /*
2095 * Search for suitable CPUs to push tasks to in successively higher 2238 * This condition is "impossible", if it occurs
2096 * domains with SD_LOAD_BALANCE set. 2239 * we need to fix it. Originally reported by
2240 * Bjorn Helgaas on a 128-cpu setup.
2097 */ 2241 */
2098 visited_cpus = CPU_MASK_NONE; 2242 BUG_ON(busiest_rq == target_rq);
2099 for_each_domain(busiest_cpu, sd) {
2100 if (!(sd->flags & SD_LOAD_BALANCE))
2101 /* no more domains to search */
2102 break;
2103 2243
2104 schedstat_inc(sd, alb_cnt); 2244 /* move a task from busiest_rq to target_rq */
2245 double_lock_balance(busiest_rq, target_rq);
2105 2246
2106 cpu_group = sd->groups; 2247 /* Search for an sd spanning us and the target CPU. */
2107 do { 2248 for_each_domain(target_cpu, sd)
2108 for_each_cpu_mask(cpu, cpu_group->cpumask) { 2249 if ((sd->flags & SD_LOAD_BALANCE) &&
2109 if (busiest_rq->nr_running <= 1) 2250 cpu_isset(busiest_cpu, sd->span))
2110 /* no more tasks left to move */ 2251 break;
2111 return; 2252
2112 if (cpu_isset(cpu, visited_cpus)) 2253 if (unlikely(sd == NULL))
2113 continue; 2254 goto out;
2114 cpu_set(cpu, visited_cpus); 2255
2115 if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) 2256 schedstat_inc(sd, alb_cnt);
2116 continue; 2257
2117 2258 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
2118 target_rq = cpu_rq(cpu); 2259 schedstat_inc(sd, alb_pushed);
2119 /* 2260 else
2120 * This condition is "impossible", if it occurs 2261 schedstat_inc(sd, alb_failed);
2121 * we need to fix it. Originally reported by 2262out:
2122 * Bjorn Helgaas on a 128-cpu setup. 2263 spin_unlock(&target_rq->lock);
2123 */
2124 BUG_ON(busiest_rq == target_rq);
2125
2126 /* move a task from busiest_rq to target_rq */
2127 double_lock_balance(busiest_rq, target_rq);
2128 if (move_tasks(target_rq, cpu, busiest_rq,
2129 1, sd, SCHED_IDLE)) {
2130 schedstat_inc(sd, alb_pushed);
2131 } else {
2132 schedstat_inc(sd, alb_failed);
2133 }
2134 spin_unlock(&target_rq->lock);
2135 }
2136 cpu_group = cpu_group->next;
2137 } while (cpu_group != sd->groups);
2138 }
2139} 2264}
2140 2265
2141/* 2266/*
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2156 unsigned long old_load, this_load; 2281 unsigned long old_load, this_load;
2157 unsigned long j = jiffies + CPU_OFFSET(this_cpu); 2282 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2158 struct sched_domain *sd; 2283 struct sched_domain *sd;
2284 int i;
2159 2285
2160 /* Update our load */
2161 old_load = this_rq->cpu_load;
2162 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2286 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2163 /* 2287 /* Update our load */
2164 * Round up the averaging division if load is increasing. This 2288 for (i = 0; i < 3; i++) {
2165 * prevents us from getting stuck on 9 if the load is 10, for 2289 unsigned long new_load = this_load;
2166 * example. 2290 int scale = 1 << i;
2167 */ 2291 old_load = this_rq->cpu_load[i];
2168 if (this_load > old_load) 2292 /*
2169 old_load++; 2293 * Round up the averaging division if load is increasing. This
2170 this_rq->cpu_load = (old_load + this_load) / 2; 2294 * prevents us from getting stuck on 9 if the load is 10, for
2295 * example.
2296 */
2297 if (new_load > old_load)
2298 new_load += scale-1;
2299 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
2300 }
2171 2301
2172 for_each_domain(this_cpu, sd) { 2302 for_each_domain(this_cpu, sd) {
2173 unsigned long interval; 2303 unsigned long interval;
@@ -2447,11 +2577,15 @@ out:
2447#ifdef CONFIG_SCHED_SMT 2577#ifdef CONFIG_SCHED_SMT
2448static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2578static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2449{ 2579{
2450 struct sched_domain *sd = this_rq->sd; 2580 struct sched_domain *tmp, *sd = NULL;
2451 cpumask_t sibling_map; 2581 cpumask_t sibling_map;
2452 int i; 2582 int i;
2453 2583
2454 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2584 for_each_domain(this_cpu, tmp)
2585 if (tmp->flags & SD_SHARE_CPUPOWER)
2586 sd = tmp;
2587
2588 if (!sd)
2455 return; 2589 return;
2456 2590
2457 /* 2591 /*
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2492 2626
2493static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2627static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2494{ 2628{
2495 struct sched_domain *sd = this_rq->sd; 2629 struct sched_domain *tmp, *sd = NULL;
2496 cpumask_t sibling_map; 2630 cpumask_t sibling_map;
2497 prio_array_t *array; 2631 prio_array_t *array;
2498 int ret = 0, i; 2632 int ret = 0, i;
2499 task_t *p; 2633 task_t *p;
2500 2634
2501 if (!(sd->flags & SD_SHARE_CPUPOWER)) 2635 for_each_domain(this_cpu, tmp)
2636 if (tmp->flags & SD_SHARE_CPUPOWER)
2637 sd = tmp;
2638
2639 if (!sd)
2502 return 0; 2640 return 0;
2503 2641
2504 /* 2642 /*
@@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val)
2576 /* 2714 /*
2577 * Underflow? 2715 * Underflow?
2578 */ 2716 */
2579 BUG_ON(((int)preempt_count() < 0)); 2717 BUG_ON((preempt_count() < 0));
2580 preempt_count() += val; 2718 preempt_count() += val;
2581 /* 2719 /*
2582 * Spinlock count overflowing soon? 2720 * Spinlock count overflowing soon?
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)
2613 struct list_head *queue; 2751 struct list_head *queue;
2614 unsigned long long now; 2752 unsigned long long now;
2615 unsigned long run_time; 2753 unsigned long run_time;
2616 int cpu, idx; 2754 int cpu, idx, new_prio;
2617 2755
2618 /* 2756 /*
2619 * Test if we are atomic. Since do_exit() needs to call into 2757 * Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2873,14 @@ go_idle:
2735 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 2873 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
2736 2874
2737 array = next->array; 2875 array = next->array;
2738 dequeue_task(next, array); 2876 new_prio = recalc_task_prio(next, next->timestamp + delta);
2739 recalc_task_prio(next, next->timestamp + delta); 2877
2740 enqueue_task(next, array); 2878 if (unlikely(next->prio != new_prio)) {
2879 dequeue_task(next, array);
2880 next->prio = new_prio;
2881 enqueue_task(next, array);
2882 } else
2883 requeue_task(next, array);
2741 } 2884 }
2742 next->activated = 0; 2885 next->activated = 0;
2743switch_tasks: 2886switch_tasks:
@@ -2761,11 +2904,15 @@ switch_tasks:
2761 rq->curr = next; 2904 rq->curr = next;
2762 ++*switch_count; 2905 ++*switch_count;
2763 2906
2764 prepare_arch_switch(rq, next); 2907 prepare_task_switch(rq, next);
2765 prev = context_switch(rq, prev, next); 2908 prev = context_switch(rq, prev, next);
2766 barrier(); 2909 barrier();
2767 2910 /*
2768 finish_task_switch(prev); 2911 * this_rq must be evaluated again because prev may have moved
2912 * CPUs since it called schedule(), thus the 'rq' on its stack
2913 * frame will be invalid.
2914 */
2915 finish_task_switch(this_rq(), prev);
2769 } else 2916 } else
2770 spin_unlock_irq(&rq->lock); 2917 spin_unlock_irq(&rq->lock);
2771 2918
@@ -2869,7 +3016,7 @@ need_resched:
2869 3016
2870int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 3017int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
2871{ 3018{
2872 task_t *p = curr->task; 3019 task_t *p = curr->private;
2873 return try_to_wake_up(p, mode, sync); 3020 return try_to_wake_up(p, mode, sync);
2874} 3021}
2875 3022
@@ -3231,8 +3378,8 @@ EXPORT_SYMBOL(set_user_nice);
3231 */ 3378 */
3232int can_nice(const task_t *p, const int nice) 3379int can_nice(const task_t *p, const int nice)
3233{ 3380{
3234 /* convert nice value [19,-20] to rlimit style value [0,39] */ 3381 /* convert nice value [19,-20] to rlimit style value [1,40] */
3235 int nice_rlim = 19 - nice; 3382 int nice_rlim = 20 - nice;
3236 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3383 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3237 capable(CAP_SYS_NICE)); 3384 capable(CAP_SYS_NICE));
3238} 3385}
@@ -3301,15 +3448,7 @@ int task_nice(const task_t *p)
3301{ 3448{
3302 return TASK_NICE(p); 3449 return TASK_NICE(p);
3303} 3450}
3304
3305/*
3306 * The only users of task_nice are binfmt_elf and binfmt_elf32.
3307 * binfmt_elf is no longer modular, but binfmt_elf32 still is.
3308 * Therefore, task_nice is needed if there is a compat_mode.
3309 */
3310#ifdef CONFIG_COMPAT
3311EXPORT_SYMBOL_GPL(task_nice); 3451EXPORT_SYMBOL_GPL(task_nice);
3312#endif
3313 3452
3314/** 3453/**
3315 * idle_cpu - is a given cpu idle currently? 3454 * idle_cpu - is a given cpu idle currently?
@@ -3347,7 +3486,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3347 p->policy = policy; 3486 p->policy = policy;
3348 p->rt_priority = prio; 3487 p->rt_priority = prio;
3349 if (policy != SCHED_NORMAL) 3488 if (policy != SCHED_NORMAL)
3350 p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; 3489 p->prio = MAX_RT_PRIO-1 - p->rt_priority;
3351 else 3490 else
3352 p->prio = p->static_prio; 3491 p->prio = p->static_prio;
3353} 3492}
@@ -3379,18 +3518,31 @@ recheck:
3379 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3518 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3380 */ 3519 */
3381 if (param->sched_priority < 0 || 3520 if (param->sched_priority < 0 ||
3382 param->sched_priority > MAX_USER_RT_PRIO-1) 3521 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3522 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3383 return -EINVAL; 3523 return -EINVAL;
3384 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3524 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
3385 return -EINVAL; 3525 return -EINVAL;
3386 3526
3387 if ((policy == SCHED_FIFO || policy == SCHED_RR) && 3527 /*
3388 param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && 3528 * Allow unprivileged RT tasks to decrease priority:
3389 !capable(CAP_SYS_NICE)) 3529 */
3390 return -EPERM; 3530 if (!capable(CAP_SYS_NICE)) {
3391 if ((current->euid != p->euid) && (current->euid != p->uid) && 3531 /* can't change policy */
3392 !capable(CAP_SYS_NICE)) 3532 if (policy != p->policy &&
3393 return -EPERM; 3533 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3534 return -EPERM;
3535 /* can't increase priority */
3536 if (policy != SCHED_NORMAL &&
3537 param->sched_priority > p->rt_priority &&
3538 param->sched_priority >
3539 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
3540 return -EPERM;
3541 /* can't change other user's priorities */
3542 if ((current->euid != p->euid) &&
3543 (current->euid != p->uid))
3544 return -EPERM;
3545 }
3394 3546
3395 retval = security_task_setscheduler(p, policy, param); 3547 retval = security_task_setscheduler(p, policy, param);
3396 if (retval) 3548 if (retval)
@@ -3727,6 +3879,13 @@ asmlinkage long sys_sched_yield(void)
3727 3879
3728static inline void __cond_resched(void) 3880static inline void __cond_resched(void)
3729{ 3881{
3882 /*
3883 * The BKS might be reacquired before we have dropped
3884 * PREEMPT_ACTIVE, which could trigger a second
3885 * cond_resched() call.
3886 */
3887 if (unlikely(preempt_count()))
3888 return;
3730 do { 3889 do {
3731 add_preempt_count(PREEMPT_ACTIVE); 3890 add_preempt_count(PREEMPT_ACTIVE);
3732 schedule(); 3891 schedule();
@@ -4016,6 +4175,14 @@ void show_state(void)
4016 read_unlock(&tasklist_lock); 4175 read_unlock(&tasklist_lock);
4017} 4176}
4018 4177
4178/**
4179 * init_idle - set up an idle thread for a given CPU
4180 * @idle: task in question
4181 * @cpu: cpu the idle task belongs to
4182 *
4183 * NOTE: this function does not set the idle thread's NEED_RESCHED
4184 * flag, to make booting more robust.
4185 */
4019void __devinit init_idle(task_t *idle, int cpu) 4186void __devinit init_idle(task_t *idle, int cpu)
4020{ 4187{
4021 runqueue_t *rq = cpu_rq(cpu); 4188 runqueue_t *rq = cpu_rq(cpu);
@@ -4030,7 +4197,9 @@ void __devinit init_idle(task_t *idle, int cpu)
4030 4197
4031 spin_lock_irqsave(&rq->lock, flags); 4198 spin_lock_irqsave(&rq->lock, flags);
4032 rq->curr = rq->idle = idle; 4199 rq->curr = rq->idle = idle;
4033 set_tsk_need_resched(idle); 4200#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4201 idle->oncpu = 1;
4202#endif
4034 spin_unlock_irqrestore(&rq->lock, flags); 4203 spin_unlock_irqrestore(&rq->lock, flags);
4035 4204
4036 /* Set the preempt count _outside_ the spinlocks! */ 4205 /* Set the preempt count _outside_ the spinlocks! */
@@ -4174,8 +4343,7 @@ static int migration_thread(void * data)
4174 struct list_head *head; 4343 struct list_head *head;
4175 migration_req_t *req; 4344 migration_req_t *req;
4176 4345
4177 if (current->flags & PF_FREEZE) 4346 try_to_freeze();
4178 refrigerator(PF_FREEZE);
4179 4347
4180 spin_lock_irq(&rq->lock); 4348 spin_lock_irq(&rq->lock);
4181 4349
@@ -4200,17 +4368,9 @@ static int migration_thread(void * data)
4200 req = list_entry(head->next, migration_req_t, list); 4368 req = list_entry(head->next, migration_req_t, list);
4201 list_del_init(head->next); 4369 list_del_init(head->next);
4202 4370
4203 if (req->type == REQ_MOVE_TASK) { 4371 spin_unlock(&rq->lock);
4204 spin_unlock(&rq->lock); 4372 __migrate_task(req->task, cpu, req->dest_cpu);
4205 __migrate_task(req->task, cpu, req->dest_cpu); 4373 local_irq_enable();
4206 local_irq_enable();
4207 } else if (req->type == REQ_SET_DOMAIN) {
4208 rq->sd = req->sd;
4209 spin_unlock_irq(&rq->lock);
4210 } else {
4211 spin_unlock_irq(&rq->lock);
4212 WARN_ON(1);
4213 }
4214 4374
4215 complete(&req->done); 4375 complete(&req->done);
4216 } 4376 }
@@ -4441,7 +4601,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4441 migration_req_t *req; 4601 migration_req_t *req;
4442 req = list_entry(rq->migration_queue.next, 4602 req = list_entry(rq->migration_queue.next,
4443 migration_req_t, list); 4603 migration_req_t, list);
4444 BUG_ON(req->type != REQ_MOVE_TASK);
4445 list_del_init(&req->list); 4604 list_del_init(&req->list);
4446 complete(&req->done); 4605 complete(&req->done);
4447 } 4606 }
@@ -4472,12 +4631,17 @@ int __init migration_init(void)
4472#endif 4631#endif
4473 4632
4474#ifdef CONFIG_SMP 4633#ifdef CONFIG_SMP
4475#define SCHED_DOMAIN_DEBUG 4634#undef SCHED_DOMAIN_DEBUG
4476#ifdef SCHED_DOMAIN_DEBUG 4635#ifdef SCHED_DOMAIN_DEBUG
4477static void sched_domain_debug(struct sched_domain *sd, int cpu) 4636static void sched_domain_debug(struct sched_domain *sd, int cpu)
4478{ 4637{
4479 int level = 0; 4638 int level = 0;
4480 4639
4640 if (!sd) {
4641 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
4642 return;
4643 }
4644
4481 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 4645 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
4482 4646
4483 do { 4647 do {
@@ -4560,37 +4724,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
4560#define sched_domain_debug(sd, cpu) {} 4724#define sched_domain_debug(sd, cpu) {}
4561#endif 4725#endif
4562 4726
4727static int sd_degenerate(struct sched_domain *sd)
4728{
4729 if (cpus_weight(sd->span) == 1)
4730 return 1;
4731
4732 /* Following flags need at least 2 groups */
4733 if (sd->flags & (SD_LOAD_BALANCE |
4734 SD_BALANCE_NEWIDLE |
4735 SD_BALANCE_FORK |
4736 SD_BALANCE_EXEC)) {
4737 if (sd->groups != sd->groups->next)
4738 return 0;
4739 }
4740
4741 /* Following flags don't use groups */
4742 if (sd->flags & (SD_WAKE_IDLE |
4743 SD_WAKE_AFFINE |
4744 SD_WAKE_BALANCE))
4745 return 0;
4746
4747 return 1;
4748}
4749
4750static int sd_parent_degenerate(struct sched_domain *sd,
4751 struct sched_domain *parent)
4752{
4753 unsigned long cflags = sd->flags, pflags = parent->flags;
4754
4755 if (sd_degenerate(parent))
4756 return 1;
4757
4758 if (!cpus_equal(sd->span, parent->span))
4759 return 0;
4760
4761 /* Does parent contain flags not in child? */
4762 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
4763 if (cflags & SD_WAKE_AFFINE)
4764 pflags &= ~SD_WAKE_BALANCE;
4765 /* Flags needing groups don't count if only 1 group in parent */
4766 if (parent->groups == parent->groups->next) {
4767 pflags &= ~(SD_LOAD_BALANCE |
4768 SD_BALANCE_NEWIDLE |
4769 SD_BALANCE_FORK |
4770 SD_BALANCE_EXEC);
4771 }
4772 if (~cflags & pflags)
4773 return 0;
4774
4775 return 1;
4776}
4777
4563/* 4778/*
4564 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4779 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4565 * hold the hotplug lock. 4780 * hold the hotplug lock.
4566 */ 4781 */
4567void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) 4782void cpu_attach_domain(struct sched_domain *sd, int cpu)
4568{ 4783{
4569 migration_req_t req;
4570 unsigned long flags;
4571 runqueue_t *rq = cpu_rq(cpu); 4784 runqueue_t *rq = cpu_rq(cpu);
4572 int local = 1; 4785 struct sched_domain *tmp;
4573
4574 sched_domain_debug(sd, cpu);
4575
4576 spin_lock_irqsave(&rq->lock, flags);
4577 4786
4578 if (cpu == smp_processor_id() || !cpu_online(cpu)) { 4787 /* Remove the sched domains which do not contribute to scheduling. */
4579 rq->sd = sd; 4788 for (tmp = sd; tmp; tmp = tmp->parent) {
4580 } else { 4789 struct sched_domain *parent = tmp->parent;
4581 init_completion(&req.done); 4790 if (!parent)
4582 req.type = REQ_SET_DOMAIN; 4791 break;
4583 req.sd = sd; 4792 if (sd_parent_degenerate(tmp, parent))
4584 list_add(&req.list, &rq->migration_queue); 4793 tmp->parent = parent->parent;
4585 local = 0;
4586 } 4794 }
4587 4795
4588 spin_unlock_irqrestore(&rq->lock, flags); 4796 if (sd && sd_degenerate(sd))
4797 sd = sd->parent;
4589 4798
4590 if (!local) { 4799 sched_domain_debug(sd, cpu);
4591 wake_up_process(rq->migration_thread); 4800
4592 wait_for_completion(&req.done); 4801 rcu_assign_pointer(rq->sd, sd);
4593 }
4594} 4802}
4595 4803
4596/* cpus with isolated domains */ 4804/* cpus with isolated domains */
@@ -4622,7 +4830,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
4622 * covered by the given span, and will set each group's ->cpumask correctly, 4830 * covered by the given span, and will set each group's ->cpumask correctly,
4623 * and ->cpu_power to 0. 4831 * and ->cpu_power to 0.
4624 */ 4832 */
4625void __devinit init_sched_build_groups(struct sched_group groups[], 4833void init_sched_build_groups(struct sched_group groups[],
4626 cpumask_t span, int (*group_fn)(int cpu)) 4834 cpumask_t span, int (*group_fn)(int cpu))
4627{ 4835{
4628 struct sched_group *first = NULL, *last = NULL; 4836 struct sched_group *first = NULL, *last = NULL;
@@ -4658,13 +4866,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
4658 4866
4659 4867
4660#ifdef ARCH_HAS_SCHED_DOMAIN 4868#ifdef ARCH_HAS_SCHED_DOMAIN
4661extern void __devinit arch_init_sched_domains(void); 4869extern void build_sched_domains(const cpumask_t *cpu_map);
4662extern void __devinit arch_destroy_sched_domains(void); 4870extern void arch_init_sched_domains(const cpumask_t *cpu_map);
4871extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
4663#else 4872#else
4664#ifdef CONFIG_SCHED_SMT 4873#ifdef CONFIG_SCHED_SMT
4665static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4874static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4666static struct sched_group sched_group_cpus[NR_CPUS]; 4875static struct sched_group sched_group_cpus[NR_CPUS];
4667static int __devinit cpu_to_cpu_group(int cpu) 4876static int cpu_to_cpu_group(int cpu)
4668{ 4877{
4669 return cpu; 4878 return cpu;
4670} 4879}
@@ -4672,7 +4881,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
4672 4881
4673static DEFINE_PER_CPU(struct sched_domain, phys_domains); 4882static DEFINE_PER_CPU(struct sched_domain, phys_domains);
4674static struct sched_group sched_group_phys[NR_CPUS]; 4883static struct sched_group sched_group_phys[NR_CPUS];
4675static int __devinit cpu_to_phys_group(int cpu) 4884static int cpu_to_phys_group(int cpu)
4676{ 4885{
4677#ifdef CONFIG_SCHED_SMT 4886#ifdef CONFIG_SCHED_SMT
4678 return first_cpu(cpu_sibling_map[cpu]); 4887 return first_cpu(cpu_sibling_map[cpu]);
@@ -4685,7 +4894,7 @@ static int __devinit cpu_to_phys_group(int cpu)
4685 4894
4686static DEFINE_PER_CPU(struct sched_domain, node_domains); 4895static DEFINE_PER_CPU(struct sched_domain, node_domains);
4687static struct sched_group sched_group_nodes[MAX_NUMNODES]; 4896static struct sched_group sched_group_nodes[MAX_NUMNODES];
4688static int __devinit cpu_to_node_group(int cpu) 4897static int cpu_to_node_group(int cpu)
4689{ 4898{
4690 return cpu_to_node(cpu); 4899 return cpu_to_node(cpu);
4691} 4900}
@@ -4716,39 +4925,28 @@ static void check_sibling_maps(void)
4716#endif 4925#endif
4717 4926
4718/* 4927/*
4719 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 4928 * Build sched domains for a given set of cpus and attach the sched domains
4929 * to the individual cpus
4720 */ 4930 */
4721static void __devinit arch_init_sched_domains(void) 4931static void build_sched_domains(const cpumask_t *cpu_map)
4722{ 4932{
4723 int i; 4933 int i;
4724 cpumask_t cpu_default_map;
4725
4726#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4727 check_sibling_maps();
4728#endif
4729 /*
4730 * Setup mask for cpus without special case scheduling requirements.
4731 * For now this just excludes isolated cpus, but could be used to
4732 * exclude other special cases in the future.
4733 */
4734 cpus_complement(cpu_default_map, cpu_isolated_map);
4735 cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
4736 4934
4737 /* 4935 /*
4738 * Set up domains. Isolated domains just stay on the dummy domain. 4936 * Set up domains for cpus specified by the cpu_map.
4739 */ 4937 */
4740 for_each_cpu_mask(i, cpu_default_map) { 4938 for_each_cpu_mask(i, *cpu_map) {
4741 int group; 4939 int group;
4742 struct sched_domain *sd = NULL, *p; 4940 struct sched_domain *sd = NULL, *p;
4743 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 4941 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
4744 4942
4745 cpus_and(nodemask, nodemask, cpu_default_map); 4943 cpus_and(nodemask, nodemask, *cpu_map);
4746 4944
4747#ifdef CONFIG_NUMA 4945#ifdef CONFIG_NUMA
4748 sd = &per_cpu(node_domains, i); 4946 sd = &per_cpu(node_domains, i);
4749 group = cpu_to_node_group(i); 4947 group = cpu_to_node_group(i);
4750 *sd = SD_NODE_INIT; 4948 *sd = SD_NODE_INIT;
4751 sd->span = cpu_default_map; 4949 sd->span = *cpu_map;
4752 sd->groups = &sched_group_nodes[group]; 4950 sd->groups = &sched_group_nodes[group];
4753#endif 4951#endif
4754 4952
@@ -4766,7 +4964,7 @@ static void __devinit arch_init_sched_domains(void)
4766 group = cpu_to_cpu_group(i); 4964 group = cpu_to_cpu_group(i);
4767 *sd = SD_SIBLING_INIT; 4965 *sd = SD_SIBLING_INIT;
4768 sd->span = cpu_sibling_map[i]; 4966 sd->span = cpu_sibling_map[i];
4769 cpus_and(sd->span, sd->span, cpu_default_map); 4967 cpus_and(sd->span, sd->span, *cpu_map);
4770 sd->parent = p; 4968 sd->parent = p;
4771 sd->groups = &sched_group_cpus[group]; 4969 sd->groups = &sched_group_cpus[group];
4772#endif 4970#endif
@@ -4776,7 +4974,7 @@ static void __devinit arch_init_sched_domains(void)
4776 /* Set up CPU (sibling) groups */ 4974 /* Set up CPU (sibling) groups */
4777 for_each_online_cpu(i) { 4975 for_each_online_cpu(i) {
4778 cpumask_t this_sibling_map = cpu_sibling_map[i]; 4976 cpumask_t this_sibling_map = cpu_sibling_map[i];
4779 cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); 4977 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4780 if (i != first_cpu(this_sibling_map)) 4978 if (i != first_cpu(this_sibling_map))
4781 continue; 4979 continue;
4782 4980
@@ -4789,7 +4987,7 @@ static void __devinit arch_init_sched_domains(void)
4789 for (i = 0; i < MAX_NUMNODES; i++) { 4987 for (i = 0; i < MAX_NUMNODES; i++) {
4790 cpumask_t nodemask = node_to_cpumask(i); 4988 cpumask_t nodemask = node_to_cpumask(i);
4791 4989
4792 cpus_and(nodemask, nodemask, cpu_default_map); 4990 cpus_and(nodemask, nodemask, *cpu_map);
4793 if (cpus_empty(nodemask)) 4991 if (cpus_empty(nodemask))
4794 continue; 4992 continue;
4795 4993
@@ -4799,12 +4997,12 @@ static void __devinit arch_init_sched_domains(void)
4799 4997
4800#ifdef CONFIG_NUMA 4998#ifdef CONFIG_NUMA
4801 /* Set up node groups */ 4999 /* Set up node groups */
4802 init_sched_build_groups(sched_group_nodes, cpu_default_map, 5000 init_sched_build_groups(sched_group_nodes, *cpu_map,
4803 &cpu_to_node_group); 5001 &cpu_to_node_group);
4804#endif 5002#endif
4805 5003
4806 /* Calculate CPU power for physical packages and nodes */ 5004 /* Calculate CPU power for physical packages and nodes */
4807 for_each_cpu_mask(i, cpu_default_map) { 5005 for_each_cpu_mask(i, *cpu_map) {
4808 int power; 5006 int power;
4809 struct sched_domain *sd; 5007 struct sched_domain *sd;
4810#ifdef CONFIG_SCHED_SMT 5008#ifdef CONFIG_SCHED_SMT
@@ -4828,7 +5026,7 @@ static void __devinit arch_init_sched_domains(void)
4828 } 5026 }
4829 5027
4830 /* Attach the domains */ 5028 /* Attach the domains */
4831 for_each_online_cpu(i) { 5029 for_each_cpu_mask(i, *cpu_map) {
4832 struct sched_domain *sd; 5030 struct sched_domain *sd;
4833#ifdef CONFIG_SCHED_SMT 5031#ifdef CONFIG_SCHED_SMT
4834 sd = &per_cpu(cpu_domains, i); 5032 sd = &per_cpu(cpu_domains, i);
@@ -4838,41 +5036,85 @@ static void __devinit arch_init_sched_domains(void)
4838 cpu_attach_domain(sd, i); 5036 cpu_attach_domain(sd, i);
4839 } 5037 }
4840} 5038}
5039/*
5040 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5041 */
5042static void arch_init_sched_domains(cpumask_t *cpu_map)
5043{
5044 cpumask_t cpu_default_map;
4841 5045
4842#ifdef CONFIG_HOTPLUG_CPU 5046#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4843static void __devinit arch_destroy_sched_domains(void) 5047 check_sibling_maps();
5048#endif
5049 /*
5050 * Setup mask for cpus without special case scheduling requirements.
5051 * For now this just excludes isolated cpus, but could be used to
5052 * exclude other special cases in the future.
5053 */
5054 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5055
5056 build_sched_domains(&cpu_default_map);
5057}
5058
5059static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
4844{ 5060{
4845 /* Do nothing: everything is statically allocated. */ 5061 /* Do nothing: everything is statically allocated. */
4846} 5062}
4847#endif
4848 5063
4849#endif /* ARCH_HAS_SCHED_DOMAIN */ 5064#endif /* ARCH_HAS_SCHED_DOMAIN */
4850 5065
4851/* 5066/*
4852 * Initial dummy domain for early boot and for hotplug cpu. Being static, 5067 * Detach sched domains from a group of cpus specified in cpu_map
4853 * it is initialized to zero, so all balancing flags are cleared which is 5068 * These cpus will now be attached to the NULL domain
4854 * what we want.
4855 */ 5069 */
4856static struct sched_domain sched_domain_dummy; 5070static inline void detach_destroy_domains(const cpumask_t *cpu_map)
5071{
5072 int i;
5073
5074 for_each_cpu_mask(i, *cpu_map)
5075 cpu_attach_domain(NULL, i);
5076 synchronize_sched();
5077 arch_destroy_sched_domains(cpu_map);
5078}
5079
5080/*
5081 * Partition sched domains as specified by the cpumasks below.
5082 * This attaches all cpus from the cpumasks to the NULL domain,
5083 * waits for a RCU quiescent period, recalculates sched
5084 * domain information and then attaches them back to the
5085 * correct sched domains
5086 * Call with hotplug lock held
5087 */
5088void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
5089{
5090 cpumask_t change_map;
5091
5092 cpus_and(*partition1, *partition1, cpu_online_map);
5093 cpus_and(*partition2, *partition2, cpu_online_map);
5094 cpus_or(change_map, *partition1, *partition2);
5095
5096 /* Detach sched domains from all of the affected cpus */
5097 detach_destroy_domains(&change_map);
5098 if (!cpus_empty(*partition1))
5099 build_sched_domains(partition1);
5100 if (!cpus_empty(*partition2))
5101 build_sched_domains(partition2);
5102}
4857 5103
4858#ifdef CONFIG_HOTPLUG_CPU 5104#ifdef CONFIG_HOTPLUG_CPU
4859/* 5105/*
4860 * Force a reinitialization of the sched domains hierarchy. The domains 5106 * Force a reinitialization of the sched domains hierarchy. The domains
4861 * and groups cannot be updated in place without racing with the balancing 5107 * and groups cannot be updated in place without racing with the balancing
4862 * code, so we temporarily attach all running cpus to a "dummy" domain 5108 * code, so we temporarily attach all running cpus to the NULL domain
4863 * which will prevent rebalancing while the sched domains are recalculated. 5109 * which will prevent rebalancing while the sched domains are recalculated.
4864 */ 5110 */
4865static int update_sched_domains(struct notifier_block *nfb, 5111static int update_sched_domains(struct notifier_block *nfb,
4866 unsigned long action, void *hcpu) 5112 unsigned long action, void *hcpu)
4867{ 5113{
4868 int i;
4869
4870 switch (action) { 5114 switch (action) {
4871 case CPU_UP_PREPARE: 5115 case CPU_UP_PREPARE:
4872 case CPU_DOWN_PREPARE: 5116 case CPU_DOWN_PREPARE:
4873 for_each_online_cpu(i) 5117 detach_destroy_domains(&cpu_online_map);
4874 cpu_attach_domain(&sched_domain_dummy, i);
4875 arch_destroy_sched_domains();
4876 return NOTIFY_OK; 5118 return NOTIFY_OK;
4877 5119
4878 case CPU_UP_CANCELED: 5120 case CPU_UP_CANCELED:
@@ -4888,7 +5130,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4888 } 5130 }
4889 5131
4890 /* The hotplug lock is already held by cpu_up/cpu_down */ 5132 /* The hotplug lock is already held by cpu_up/cpu_down */
4891 arch_init_sched_domains(); 5133 arch_init_sched_domains(&cpu_online_map);
4892 5134
4893 return NOTIFY_OK; 5135 return NOTIFY_OK;
4894} 5136}
@@ -4897,7 +5139,7 @@ static int update_sched_domains(struct notifier_block *nfb,
4897void __init sched_init_smp(void) 5139void __init sched_init_smp(void)
4898{ 5140{
4899 lock_cpu_hotplug(); 5141 lock_cpu_hotplug();
4900 arch_init_sched_domains(); 5142 arch_init_sched_domains(&cpu_online_map);
4901 unlock_cpu_hotplug(); 5143 unlock_cpu_hotplug();
4902 /* XXX: Theoretical race here - CPU may be hotplugged now */ 5144 /* XXX: Theoretical race here - CPU may be hotplugged now */
4903 hotcpu_notifier(update_sched_domains, 0); 5145 hotcpu_notifier(update_sched_domains, 0);
@@ -4927,13 +5169,15 @@ void __init sched_init(void)
4927 5169
4928 rq = cpu_rq(i); 5170 rq = cpu_rq(i);
4929 spin_lock_init(&rq->lock); 5171 spin_lock_init(&rq->lock);
5172 rq->nr_running = 0;
4930 rq->active = rq->arrays; 5173 rq->active = rq->arrays;
4931 rq->expired = rq->arrays + 1; 5174 rq->expired = rq->arrays + 1;
4932 rq->best_expired_prio = MAX_PRIO; 5175 rq->best_expired_prio = MAX_PRIO;
4933 5176
4934#ifdef CONFIG_SMP 5177#ifdef CONFIG_SMP
4935 rq->sd = &sched_domain_dummy; 5178 rq->sd = NULL;
4936 rq->cpu_load = 0; 5179 for (j = 1; j < 3; j++)
5180 rq->cpu_load[j] = 0;
4937 rq->active_balance = 0; 5181 rq->active_balance = 0;
4938 rq->push_cpu = 0; 5182 rq->push_cpu = 0;
4939 rq->migration_thread = NULL; 5183 rq->migration_thread = NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index c89821b69ae3..d282fea81138 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -213,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
213fastcall void recalc_sigpending_tsk(struct task_struct *t) 213fastcall void recalc_sigpending_tsk(struct task_struct *t)
214{ 214{
215 if (t->signal->group_stop_count > 0 || 215 if (t->signal->group_stop_count > 0 ||
216 (freezing(t)) ||
216 PENDING(&t->pending, &t->blocked) || 217 PENDING(&t->pending, &t->blocked) ||
217 PENDING(&t->signal->shared_pending, &t->blocked)) 218 PENDING(&t->signal->shared_pending, &t->blocked))
218 set_tsk_thread_flag(t, TIF_SIGPENDING); 219 set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -691,7 +692,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
691{ 692{
692 struct task_struct *t; 693 struct task_struct *t;
693 694
694 if (p->flags & SIGNAL_GROUP_EXIT) 695 if (p->signal->flags & SIGNAL_GROUP_EXIT)
695 /* 696 /*
696 * The process is in the middle of dying already. 697 * The process is in the middle of dying already.
697 */ 698 */
@@ -2230,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2230 current->state = TASK_INTERRUPTIBLE; 2231 current->state = TASK_INTERRUPTIBLE;
2231 timeout = schedule_timeout(timeout); 2232 timeout = schedule_timeout(timeout);
2232 2233
2233 if (current->flags & PF_FREEZE) 2234 try_to_freeze();
2234 refrigerator(PF_FREEZE);
2235 spin_lock_irq(&current->sighand->siglock); 2235 spin_lock_irq(&current->sighand->siglock);
2236 sig = dequeue_signal(current, &these, &info); 2236 sig = dequeue_signal(current, &these, &info);
2237 current->blocked = current->real_blocked; 2237 current->blocked = current->real_blocked;
diff --git a/kernel/sys.c b/kernel/sys.c
index f006632c2ba7..0bcaed6560ac 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/highuid.h> 17#include <linux/highuid.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/kexec.h>
19#include <linux/workqueue.h> 21#include <linux/workqueue.h>
20#include <linux/device.h> 22#include <linux/device.h>
21#include <linux/key.h> 23#include <linux/key.h>
@@ -359,6 +361,64 @@ out_unlock:
359 return retval; 361 return retval;
360} 362}
361 363
364void emergency_restart(void)
365{
366 machine_emergency_restart();
367}
368EXPORT_SYMBOL_GPL(emergency_restart);
369
370void kernel_restart(char *cmd)
371{
372 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
373 system_state = SYSTEM_RESTART;
374 device_shutdown();
375 if (!cmd) {
376 printk(KERN_EMERG "Restarting system.\n");
377 } else {
378 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
379 }
380 printk(".\n");
381 machine_restart(cmd);
382}
383EXPORT_SYMBOL_GPL(kernel_restart);
384
385void kernel_kexec(void)
386{
387#ifdef CONFIG_KEXEC
388 struct kimage *image;
389 image = xchg(&kexec_image, 0);
390 if (!image) {
391 return;
392 }
393 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
394 system_state = SYSTEM_RESTART;
395 device_shutdown();
396 printk(KERN_EMERG "Starting new kernel\n");
397 machine_shutdown();
398 machine_kexec(image);
399#endif
400}
401EXPORT_SYMBOL_GPL(kernel_kexec);
402
403void kernel_halt(void)
404{
405 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
406 system_state = SYSTEM_HALT;
407 device_shutdown();
408 printk(KERN_EMERG "System halted.\n");
409 machine_halt();
410}
411EXPORT_SYMBOL_GPL(kernel_halt);
412
413void kernel_power_off(void)
414{
415 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
416 system_state = SYSTEM_POWER_OFF;
417 device_shutdown();
418 printk(KERN_EMERG "Power down.\n");
419 machine_power_off();
420}
421EXPORT_SYMBOL_GPL(kernel_power_off);
362 422
363/* 423/*
364 * Reboot system call: for obvious reasons only root may call it, 424 * Reboot system call: for obvious reasons only root may call it,
@@ -387,11 +447,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
387 lock_kernel(); 447 lock_kernel();
388 switch (cmd) { 448 switch (cmd) {
389 case LINUX_REBOOT_CMD_RESTART: 449 case LINUX_REBOOT_CMD_RESTART:
390 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); 450 kernel_restart(NULL);
391 system_state = SYSTEM_RESTART;
392 device_shutdown();
393 printk(KERN_EMERG "Restarting system.\n");
394 machine_restart(NULL);
395 break; 451 break;
396 452
397 case LINUX_REBOOT_CMD_CAD_ON: 453 case LINUX_REBOOT_CMD_CAD_ON:
@@ -403,21 +459,13 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
403 break; 459 break;
404 460
405 case LINUX_REBOOT_CMD_HALT: 461 case LINUX_REBOOT_CMD_HALT:
406 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); 462 kernel_halt();
407 system_state = SYSTEM_HALT;
408 device_shutdown();
409 printk(KERN_EMERG "System halted.\n");
410 machine_halt();
411 unlock_kernel(); 463 unlock_kernel();
412 do_exit(0); 464 do_exit(0);
413 break; 465 break;
414 466
415 case LINUX_REBOOT_CMD_POWER_OFF: 467 case LINUX_REBOOT_CMD_POWER_OFF:
416 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); 468 kernel_power_off();
417 system_state = SYSTEM_POWER_OFF;
418 device_shutdown();
419 printk(KERN_EMERG "Power down.\n");
420 machine_power_off();
421 unlock_kernel(); 469 unlock_kernel();
422 do_exit(0); 470 do_exit(0);
423 break; 471 break;
@@ -429,13 +477,14 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
429 } 477 }
430 buffer[sizeof(buffer) - 1] = '\0'; 478 buffer[sizeof(buffer) - 1] = '\0';
431 479
432 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); 480 kernel_restart(buffer);
433 system_state = SYSTEM_RESTART;
434 device_shutdown();
435 printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
436 machine_restart(buffer);
437 break; 481 break;
438 482
483 case LINUX_REBOOT_CMD_KEXEC:
484 kernel_kexec();
485 unlock_kernel();
486 return -EINVAL;
487
439#ifdef CONFIG_SOFTWARE_SUSPEND 488#ifdef CONFIG_SOFTWARE_SUSPEND
440 case LINUX_REBOOT_CMD_SW_SUSPEND: 489 case LINUX_REBOOT_CMD_SW_SUSPEND:
441 { 490 {
@@ -455,8 +504,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
455 504
456static void deferred_cad(void *dummy) 505static void deferred_cad(void *dummy)
457{ 506{
458 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); 507 kernel_restart(NULL);
459 machine_restart(NULL);
460} 508}
461 509
462/* 510/*
@@ -525,7 +573,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
525 } 573 }
526 if (new_egid != old_egid) 574 if (new_egid != old_egid)
527 { 575 {
528 current->mm->dumpable = 0; 576 current->mm->dumpable = suid_dumpable;
529 smp_wmb(); 577 smp_wmb();
530 } 578 }
531 if (rgid != (gid_t) -1 || 579 if (rgid != (gid_t) -1 ||
@@ -556,7 +604,7 @@ asmlinkage long sys_setgid(gid_t gid)
556 { 604 {
557 if(old_egid != gid) 605 if(old_egid != gid)
558 { 606 {
559 current->mm->dumpable=0; 607 current->mm->dumpable = suid_dumpable;
560 smp_wmb(); 608 smp_wmb();
561 } 609 }
562 current->gid = current->egid = current->sgid = current->fsgid = gid; 610 current->gid = current->egid = current->sgid = current->fsgid = gid;
@@ -565,7 +613,7 @@ asmlinkage long sys_setgid(gid_t gid)
565 { 613 {
566 if(old_egid != gid) 614 if(old_egid != gid)
567 { 615 {
568 current->mm->dumpable=0; 616 current->mm->dumpable = suid_dumpable;
569 smp_wmb(); 617 smp_wmb();
570 } 618 }
571 current->egid = current->fsgid = gid; 619 current->egid = current->fsgid = gid;
@@ -596,7 +644,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
596 644
597 if(dumpclear) 645 if(dumpclear)
598 { 646 {
599 current->mm->dumpable = 0; 647 current->mm->dumpable = suid_dumpable;
600 smp_wmb(); 648 smp_wmb();
601 } 649 }
602 current->uid = new_ruid; 650 current->uid = new_ruid;
@@ -653,7 +701,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
653 701
654 if (new_euid != old_euid) 702 if (new_euid != old_euid)
655 { 703 {
656 current->mm->dumpable=0; 704 current->mm->dumpable = suid_dumpable;
657 smp_wmb(); 705 smp_wmb();
658 } 706 }
659 current->fsuid = current->euid = new_euid; 707 current->fsuid = current->euid = new_euid;
@@ -703,7 +751,7 @@ asmlinkage long sys_setuid(uid_t uid)
703 751
704 if (old_euid != uid) 752 if (old_euid != uid)
705 { 753 {
706 current->mm->dumpable = 0; 754 current->mm->dumpable = suid_dumpable;
707 smp_wmb(); 755 smp_wmb();
708 } 756 }
709 current->fsuid = current->euid = uid; 757 current->fsuid = current->euid = uid;
@@ -748,7 +796,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
748 if (euid != (uid_t) -1) { 796 if (euid != (uid_t) -1) {
749 if (euid != current->euid) 797 if (euid != current->euid)
750 { 798 {
751 current->mm->dumpable = 0; 799 current->mm->dumpable = suid_dumpable;
752 smp_wmb(); 800 smp_wmb();
753 } 801 }
754 current->euid = euid; 802 current->euid = euid;
@@ -798,7 +846,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
798 if (egid != (gid_t) -1) { 846 if (egid != (gid_t) -1) {
799 if (egid != current->egid) 847 if (egid != current->egid)
800 { 848 {
801 current->mm->dumpable = 0; 849 current->mm->dumpable = suid_dumpable;
802 smp_wmb(); 850 smp_wmb();
803 } 851 }
804 current->egid = egid; 852 current->egid = egid;
@@ -845,7 +893,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
845 { 893 {
846 if (uid != old_fsuid) 894 if (uid != old_fsuid)
847 { 895 {
848 current->mm->dumpable = 0; 896 current->mm->dumpable = suid_dumpable;
849 smp_wmb(); 897 smp_wmb();
850 } 898 }
851 current->fsuid = uid; 899 current->fsuid = uid;
@@ -875,7 +923,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
875 { 923 {
876 if (gid != old_fsgid) 924 if (gid != old_fsgid)
877 { 925 {
878 current->mm->dumpable = 0; 926 current->mm->dumpable = suid_dumpable;
879 smp_wmb(); 927 smp_wmb();
880 } 928 }
881 current->fsgid = gid; 929 current->fsgid = gid;
@@ -894,35 +942,69 @@ asmlinkage long sys_times(struct tms __user * tbuf)
894 */ 942 */
895 if (tbuf) { 943 if (tbuf) {
896 struct tms tmp; 944 struct tms tmp;
897 struct task_struct *tsk = current;
898 struct task_struct *t;
899 cputime_t utime, stime, cutime, cstime; 945 cputime_t utime, stime, cutime, cstime;
900 946
901 read_lock(&tasklist_lock); 947#ifdef CONFIG_SMP
902 utime = tsk->signal->utime; 948 if (thread_group_empty(current)) {
903 stime = tsk->signal->stime; 949 /*
904 t = tsk; 950 * Single thread case without the use of any locks.
905 do { 951 *
906 utime = cputime_add(utime, t->utime); 952 * We may race with release_task if two threads are
907 stime = cputime_add(stime, t->stime); 953 * executing. However, release task first adds up the
908 t = next_thread(t); 954 * counters (__exit_signal) before removing the task
909 } while (t != tsk); 955 * from the process tasklist (__unhash_process).
910 956 * __exit_signal also acquires and releases the
911 /* 957 * siglock which results in the proper memory ordering
912 * While we have tasklist_lock read-locked, no dying thread 958 * so that the list modifications are always visible
913 * can be updating current->signal->[us]time. Instead, 959 * after the counters have been updated.
914 * we got their counts included in the live thread loop. 960 *
915 * However, another thread can come in right now and 961 * If the counters have been updated by the second thread
916 * do a wait call that updates current->signal->c[us]time. 962 * but the thread has not yet been removed from the list
917 * To make sure we always see that pair updated atomically, 963 * then the other branch will be executing which will
918 * we take the siglock around fetching them. 964 * block on tasklist_lock until the exit handling of the
919 */ 965 * other task is finished.
920 spin_lock_irq(&tsk->sighand->siglock); 966 *
921 cutime = tsk->signal->cutime; 967 * This also implies that the sighand->siglock cannot
922 cstime = tsk->signal->cstime; 968 * be held by another processor. So we can also
923 spin_unlock_irq(&tsk->sighand->siglock); 969 * skip acquiring that lock.
924 read_unlock(&tasklist_lock); 970 */
971 utime = cputime_add(current->signal->utime, current->utime);
972 stime = cputime_add(current->signal->utime, current->stime);
973 cutime = current->signal->cutime;
974 cstime = current->signal->cstime;
975 } else
976#endif
977 {
978
979 /* Process with multiple threads */
980 struct task_struct *tsk = current;
981 struct task_struct *t;
925 982
983 read_lock(&tasklist_lock);
984 utime = tsk->signal->utime;
985 stime = tsk->signal->stime;
986 t = tsk;
987 do {
988 utime = cputime_add(utime, t->utime);
989 stime = cputime_add(stime, t->stime);
990 t = next_thread(t);
991 } while (t != tsk);
992
993 /*
994 * While we have tasklist_lock read-locked, no dying thread
995 * can be updating current->signal->[us]time. Instead,
996 * we got their counts included in the live thread loop.
997 * However, another thread can come in right now and
998 * do a wait call that updates current->signal->c[us]time.
999 * To make sure we always see that pair updated atomically,
1000 * we take the siglock around fetching them.
1001 */
1002 spin_lock_irq(&tsk->sighand->siglock);
1003 cutime = tsk->signal->cutime;
1004 cstime = tsk->signal->cstime;
1005 spin_unlock_irq(&tsk->sighand->siglock);
1006 read_unlock(&tasklist_lock);
1007 }
926 tmp.tms_utime = cputime_to_clock_t(utime); 1008 tmp.tms_utime = cputime_to_clock_t(utime);
927 tmp.tms_stime = cputime_to_clock_t(stime); 1009 tmp.tms_stime = cputime_to_clock_t(stime);
928 tmp.tms_cutime = cputime_to_clock_t(cutime); 1010 tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1225,7 +1307,7 @@ static void groups_sort(struct group_info *group_info)
1225} 1307}
1226 1308
1227/* a simple bsearch */ 1309/* a simple bsearch */
1228static int groups_search(struct group_info *group_info, gid_t grp) 1310int groups_search(struct group_info *group_info, gid_t grp)
1229{ 1311{
1230 int left, right; 1312 int left, right;
1231 1313
@@ -1652,7 +1734,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1652 error = 1; 1734 error = 1;
1653 break; 1735 break;
1654 case PR_SET_DUMPABLE: 1736 case PR_SET_DUMPABLE:
1655 if (arg2 != 0 && arg2 != 1) { 1737 if (arg2 < 0 || arg2 > 2) {
1656 error = -EINVAL; 1738 error = -EINVAL;
1657 break; 1739 break;
1658 } 1740 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6f15bea7d1a8..1ab2370e2efa 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
18cond_syscall(sys_lookup_dcookie); 18cond_syscall(sys_lookup_dcookie);
19cond_syscall(sys_swapon); 19cond_syscall(sys_swapon);
20cond_syscall(sys_swapoff); 20cond_syscall(sys_swapoff);
21cond_syscall(sys_kexec_load);
22cond_syscall(compat_sys_kexec_load);
21cond_syscall(sys_init_module); 23cond_syscall(sys_init_module);
22cond_syscall(sys_delete_module); 24cond_syscall(sys_delete_module);
23cond_syscall(sys_socketpair); 25cond_syscall(sys_socketpair);
@@ -77,7 +79,9 @@ cond_syscall(sys_request_key);
77cond_syscall(sys_keyctl); 79cond_syscall(sys_keyctl);
78cond_syscall(compat_sys_keyctl); 80cond_syscall(compat_sys_keyctl);
79cond_syscall(compat_sys_socketcall); 81cond_syscall(compat_sys_socketcall);
80cond_syscall(sys_set_zone_reclaim); 82cond_syscall(sys_inotify_init);
83cond_syscall(sys_inotify_add_watch);
84cond_syscall(sys_inotify_rm_watch);
81 85
82/* arch-specific weak syscall entries */ 86/* arch-specific weak syscall entries */
83cond_syscall(sys_pciconfig_read); 87cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 701d12c63068..3e0bbee549ea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio;
58extern int max_threads; 58extern int max_threads;
59extern int sysrq_enabled; 59extern int sysrq_enabled;
60extern int core_uses_pid; 60extern int core_uses_pid;
61extern int suid_dumpable;
61extern char core_pattern[]; 62extern char core_pattern[];
62extern int cad_pid; 63extern int cad_pid;
63extern int pid_max; 64extern int pid_max;
@@ -113,6 +114,7 @@ extern int unaligned_enabled;
113extern int sysctl_ieee_emulation_warnings; 114extern int sysctl_ieee_emulation_warnings;
114#endif 115#endif
115extern int sysctl_userprocess_debug; 116extern int sysctl_userprocess_debug;
117extern int spin_retry;
116#endif 118#endif
117 119
118extern int sysctl_hz_timer; 120extern int sysctl_hz_timer;
@@ -145,6 +147,9 @@ extern ctl_table random_table[];
145#ifdef CONFIG_UNIX98_PTYS 147#ifdef CONFIG_UNIX98_PTYS
146extern ctl_table pty_table[]; 148extern ctl_table pty_table[];
147#endif 149#endif
150#ifdef CONFIG_INOTIFY
151extern ctl_table inotify_table[];
152#endif
148 153
149#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 154#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
150int sysctl_legacy_va_layout; 155int sysctl_legacy_va_layout;
@@ -217,6 +222,7 @@ static ctl_table root_table[] = {
217 .mode = 0555, 222 .mode = 0555,
218 .child = dev_table, 223 .child = dev_table,
219 }, 224 },
225
220 { .ctl_name = 0 } 226 { .ctl_name = 0 }
221}; 227};
222 228
@@ -642,7 +648,16 @@ static ctl_table kern_table[] = {
642 .mode = 0644, 648 .mode = 0644,
643 .proc_handler = &proc_dointvec, 649 .proc_handler = &proc_dointvec,
644 }, 650 },
645 651#if defined(CONFIG_ARCH_S390)
652 {
653 .ctl_name = KERN_SPIN_RETRY,
654 .procname = "spin_retry",
655 .data = &spin_retry,
656 .maxlen = sizeof (int),
657 .mode = 0644,
658 .proc_handler = &proc_dointvec,
659 },
660#endif
646 { .ctl_name = 0 } 661 { .ctl_name = 0 }
647}; 662};
648 663
@@ -949,7 +964,23 @@ static ctl_table fs_table[] = {
949 .mode = 0644, 964 .mode = 0644,
950 .proc_handler = &proc_dointvec, 965 .proc_handler = &proc_dointvec,
951 }, 966 },
967#ifdef CONFIG_INOTIFY
968 {
969 .ctl_name = FS_INOTIFY,
970 .procname = "inotify",
971 .mode = 0555,
972 .child = inotify_table,
973 },
974#endif
952#endif 975#endif
976 {
977 .ctl_name = KERN_SETUID_DUMPABLE,
978 .procname = "suid_dumpable",
979 .data = &suid_dumpable,
980 .maxlen = sizeof(int),
981 .mode = 0644,
982 .proc_handler = &proc_dointvec,
983 },
953 { .ctl_name = 0 } 984 { .ctl_name = 0 }
954}; 985};
955 986
@@ -959,7 +990,7 @@ static ctl_table debug_table[] = {
959 990
960static ctl_table dev_table[] = { 991static ctl_table dev_table[] = {
961 { .ctl_name = 0 } 992 { .ctl_name = 0 }
962}; 993};
963 994
964extern void init_irq_proc (void); 995extern void init_irq_proc (void);
965 996
@@ -991,8 +1022,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
991 int error = parse_table(name, nlen, oldval, oldlenp, 1022 int error = parse_table(name, nlen, oldval, oldlenp,
992 newval, newlen, head->ctl_table, 1023 newval, newlen, head->ctl_table,
993 &context); 1024 &context);
994 if (context) 1025 kfree(context);
995 kfree(context);
996 if (error != -ENOTDIR) 1026 if (error != -ENOTDIR)
997 return error; 1027 return error;
998 tmp = tmp->next; 1028 tmp = tmp->next;
diff --git a/kernel/time.c b/kernel/time.c
index d4335c1c884c..dd5ae1162a8f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -128,7 +128,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us
128 * as real UNIX machines always do it. This avoids all headaches about 128 * as real UNIX machines always do it. This avoids all headaches about
129 * daylight saving times and warping kernel clocks. 129 * daylight saving times and warping kernel clocks.
130 */ 130 */
131inline static void warp_clock(void) 131static inline void warp_clock(void)
132{ 132{
133 write_seqlock_irq(&xtime_lock); 133 write_seqlock_irq(&xtime_lock);
134 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 134 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
diff --git a/kernel/timer.c b/kernel/timer.c
index 207aa4f0aa10..5377f40723ff 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec);
57#define TVN_MASK (TVN_SIZE - 1) 57#define TVN_MASK (TVN_SIZE - 1)
58#define TVR_MASK (TVR_SIZE - 1) 58#define TVR_MASK (TVR_SIZE - 1)
59 59
60struct timer_base_s {
61 spinlock_t lock;
62 struct timer_list *running_timer;
63};
64
60typedef struct tvec_s { 65typedef struct tvec_s {
61 struct list_head vec[TVN_SIZE]; 66 struct list_head vec[TVN_SIZE];
62} tvec_t; 67} tvec_t;
@@ -66,9 +71,8 @@ typedef struct tvec_root_s {
66} tvec_root_t; 71} tvec_root_t;
67 72
68struct tvec_t_base_s { 73struct tvec_t_base_s {
69 spinlock_t lock; 74 struct timer_base_s t_base;
70 unsigned long timer_jiffies; 75 unsigned long timer_jiffies;
71 struct timer_list *running_timer;
72 tvec_root_t tv1; 76 tvec_root_t tv1;
73 tvec_t tv2; 77 tvec_t tv2;
74 tvec_t tv3; 78 tvec_t tv3;
@@ -77,18 +81,16 @@ struct tvec_t_base_s {
77} ____cacheline_aligned_in_smp; 81} ____cacheline_aligned_in_smp;
78 82
79typedef struct tvec_t_base_s tvec_base_t; 83typedef struct tvec_t_base_s tvec_base_t;
84static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
80 85
81static inline void set_running_timer(tvec_base_t *base, 86static inline void set_running_timer(tvec_base_t *base,
82 struct timer_list *timer) 87 struct timer_list *timer)
83{ 88{
84#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
85 base->running_timer = timer; 90 base->t_base.running_timer = timer;
86#endif 91#endif
87} 92}
88 93
89/* Fake initialization */
90static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
91
92static void check_timer_failed(struct timer_list *timer) 94static void check_timer_failed(struct timer_list *timer)
93{ 95{
94 static int whine_count; 96 static int whine_count;
@@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer)
103 /* 105 /*
104 * Now fix it up 106 * Now fix it up
105 */ 107 */
106 spin_lock_init(&timer->lock);
107 timer->magic = TIMER_MAGIC; 108 timer->magic = TIMER_MAGIC;
108} 109}
109 110
@@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
156 list_add_tail(&timer->entry, vec); 157 list_add_tail(&timer->entry, vec);
157} 158}
158 159
160typedef struct timer_base_s timer_base_t;
161/*
162 * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
163 * at compile time, and we need timer->base to lock the timer.
164 */
165timer_base_t __init_timer_base
166 ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
167EXPORT_SYMBOL(__init_timer_base);
168
169/***
170 * init_timer - initialize a timer.
171 * @timer: the timer to be initialized
172 *
173 * init_timer() must be done to a timer prior calling *any* of the
174 * other timer functions.
175 */
176void fastcall init_timer(struct timer_list *timer)
177{
178 timer->entry.next = NULL;
179 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
180 timer->magic = TIMER_MAGIC;
181}
182EXPORT_SYMBOL(init_timer);
183
184static inline void detach_timer(struct timer_list *timer,
185 int clear_pending)
186{
187 struct list_head *entry = &timer->entry;
188
189 __list_del(entry->prev, entry->next);
190 if (clear_pending)
191 entry->next = NULL;
192 entry->prev = LIST_POISON2;
193}
194
195/*
196 * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
197 * means that all timers which are tied to this base via timer->base are
198 * locked, and the base itself is locked too.
199 *
200 * So __run_timers/migrate_timers can safely modify all timers which could
201 * be found on ->tvX lists.
202 *
203 * When the timer's base is locked, and the timer removed from list, it is
204 * possible to set timer->base = NULL and drop the lock: the timer remains
205 * locked.
206 */
207static timer_base_t *lock_timer_base(struct timer_list *timer,
208 unsigned long *flags)
209{
210 timer_base_t *base;
211
212 for (;;) {
213 base = timer->base;
214 if (likely(base != NULL)) {
215 spin_lock_irqsave(&base->lock, *flags);
216 if (likely(base == timer->base))
217 return base;
218 /* The timer has migrated to another CPU */
219 spin_unlock_irqrestore(&base->lock, *flags);
220 }
221 cpu_relax();
222 }
223}
224
159int __mod_timer(struct timer_list *timer, unsigned long expires) 225int __mod_timer(struct timer_list *timer, unsigned long expires)
160{ 226{
161 tvec_base_t *old_base, *new_base; 227 timer_base_t *base;
228 tvec_base_t *new_base;
162 unsigned long flags; 229 unsigned long flags;
163 int ret = 0; 230 int ret = 0;
164 231
165 BUG_ON(!timer->function); 232 BUG_ON(!timer->function);
166
167 check_timer(timer); 233 check_timer(timer);
168 234
169 spin_lock_irqsave(&timer->lock, flags); 235 base = lock_timer_base(timer, &flags);
236
237 if (timer_pending(timer)) {
238 detach_timer(timer, 0);
239 ret = 1;
240 }
241
170 new_base = &__get_cpu_var(tvec_bases); 242 new_base = &__get_cpu_var(tvec_bases);
171repeat:
172 old_base = timer->base;
173 243
174 /* 244 if (base != &new_base->t_base) {
175 * Prevent deadlocks via ordering by old_base < new_base.
176 */
177 if (old_base && (new_base != old_base)) {
178 if (old_base < new_base) {
179 spin_lock(&new_base->lock);
180 spin_lock(&old_base->lock);
181 } else {
182 spin_lock(&old_base->lock);
183 spin_lock(&new_base->lock);
184 }
185 /* 245 /*
186 * The timer base might have been cancelled while we were 246 * We are trying to schedule the timer on the local CPU.
187 * trying to take the lock(s): 247 * However we can't change timer's base while it is running,
248 * otherwise del_timer_sync() can't detect that the timer's
249 * handler yet has not finished. This also guarantees that
250 * the timer is serialized wrt itself.
188 */ 251 */
189 if (timer->base != old_base) { 252 if (unlikely(base->running_timer == timer)) {
190 spin_unlock(&new_base->lock); 253 /* The timer remains on a former base */
191 spin_unlock(&old_base->lock); 254 new_base = container_of(base, tvec_base_t, t_base);
192 goto repeat; 255 } else {
193 } 256 /* See the comment in lock_timer_base() */
194 } else { 257 timer->base = NULL;
195 spin_lock(&new_base->lock); 258 spin_unlock(&base->lock);
196 if (timer->base != old_base) { 259 spin_lock(&new_base->t_base.lock);
197 spin_unlock(&new_base->lock); 260 timer->base = &new_base->t_base;
198 goto repeat;
199 } 261 }
200 } 262 }
201 263
202 /*
203 * Delete the previous timeout (if there was any), and install
204 * the new one:
205 */
206 if (old_base) {
207 list_del(&timer->entry);
208 ret = 1;
209 }
210 timer->expires = expires; 264 timer->expires = expires;
211 internal_add_timer(new_base, timer); 265 internal_add_timer(new_base, timer);
212 timer->base = new_base; 266 spin_unlock_irqrestore(&new_base->t_base.lock, flags);
213
214 if (old_base && (new_base != old_base))
215 spin_unlock(&old_base->lock);
216 spin_unlock(&new_base->lock);
217 spin_unlock_irqrestore(&timer->lock, flags);
218 267
219 return ret; 268 return ret;
220} 269}
@@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
232{ 281{
233 tvec_base_t *base = &per_cpu(tvec_bases, cpu); 282 tvec_base_t *base = &per_cpu(tvec_bases, cpu);
234 unsigned long flags; 283 unsigned long flags;
235 284
236 BUG_ON(timer_pending(timer) || !timer->function); 285 BUG_ON(timer_pending(timer) || !timer->function);
237 286
238 check_timer(timer); 287 check_timer(timer);
239 288
240 spin_lock_irqsave(&base->lock, flags); 289 spin_lock_irqsave(&base->t_base.lock, flags);
290 timer->base = &base->t_base;
241 internal_add_timer(base, timer); 291 internal_add_timer(base, timer);
242 timer->base = base; 292 spin_unlock_irqrestore(&base->t_base.lock, flags);
243 spin_unlock_irqrestore(&base->lock, flags);
244} 293}
245 294
246 295
@@ -295,109 +344,84 @@ EXPORT_SYMBOL(mod_timer);
295 */ 344 */
296int del_timer(struct timer_list *timer) 345int del_timer(struct timer_list *timer)
297{ 346{
347 timer_base_t *base;
298 unsigned long flags; 348 unsigned long flags;
299 tvec_base_t *base; 349 int ret = 0;
300 350
301 check_timer(timer); 351 check_timer(timer);
302 352
303repeat: 353 if (timer_pending(timer)) {
304 base = timer->base; 354 base = lock_timer_base(timer, &flags);
305 if (!base) 355 if (timer_pending(timer)) {
306 return 0; 356 detach_timer(timer, 1);
307 spin_lock_irqsave(&base->lock, flags); 357 ret = 1;
308 if (base != timer->base) { 358 }
309 spin_unlock_irqrestore(&base->lock, flags); 359 spin_unlock_irqrestore(&base->lock, flags);
310 goto repeat;
311 } 360 }
312 list_del(&timer->entry);
313 /* Need to make sure that anybody who sees a NULL base also sees the list ops */
314 smp_wmb();
315 timer->base = NULL;
316 spin_unlock_irqrestore(&base->lock, flags);
317 361
318 return 1; 362 return ret;
319} 363}
320 364
321EXPORT_SYMBOL(del_timer); 365EXPORT_SYMBOL(del_timer);
322 366
323#ifdef CONFIG_SMP 367#ifdef CONFIG_SMP
324/*** 368/*
325 * del_timer_sync - deactivate a timer and wait for the handler to finish. 369 * This function tries to deactivate a timer. Upon successful (ret >= 0)
326 * @timer: the timer to be deactivated 370 * exit the timer is not queued and the handler is not running on any CPU.
327 *
328 * This function only differs from del_timer() on SMP: besides deactivating
329 * the timer it also makes sure the handler has finished executing on other
330 * CPUs.
331 *
332 * Synchronization rules: callers must prevent restarting of the timer,
333 * otherwise this function is meaningless. It must not be called from
334 * interrupt contexts. The caller must not hold locks which would prevent
335 * completion of the timer's handler. Upon exit the timer is not queued and
336 * the handler is not running on any CPU.
337 *
338 * The function returns whether it has deactivated a pending timer or not.
339 * 371 *
340 * del_timer_sync() is slow and complicated because it copes with timer 372 * It must not be called from interrupt contexts.
341 * handlers which re-arm the timer (periodic timers). If the timer handler
342 * is known to not do this (a single shot timer) then use
343 * del_singleshot_timer_sync() instead.
344 */ 373 */
345int del_timer_sync(struct timer_list *timer) 374int try_to_del_timer_sync(struct timer_list *timer)
346{ 375{
347 tvec_base_t *base; 376 timer_base_t *base;
348 int i, ret = 0; 377 unsigned long flags;
378 int ret = -1;
349 379
350 check_timer(timer); 380 base = lock_timer_base(timer, &flags);
351 381
352del_again: 382 if (base->running_timer == timer)
353 ret += del_timer(timer); 383 goto out;
354 384
355 for_each_online_cpu(i) { 385 ret = 0;
356 base = &per_cpu(tvec_bases, i); 386 if (timer_pending(timer)) {
357 if (base->running_timer == timer) { 387 detach_timer(timer, 1);
358 while (base->running_timer == timer) { 388 ret = 1;
359 cpu_relax();
360 preempt_check_resched();
361 }
362 break;
363 }
364 } 389 }
365 smp_rmb(); 390out:
366 if (timer_pending(timer)) 391 spin_unlock_irqrestore(&base->lock, flags);
367 goto del_again;
368 392
369 return ret; 393 return ret;
370} 394}
371EXPORT_SYMBOL(del_timer_sync);
372 395
373/*** 396/***
374 * del_singleshot_timer_sync - deactivate a non-recursive timer 397 * del_timer_sync - deactivate a timer and wait for the handler to finish.
375 * @timer: the timer to be deactivated 398 * @timer: the timer to be deactivated
376 * 399 *
377 * This function is an optimization of del_timer_sync for the case where the 400 * This function only differs from del_timer() on SMP: besides deactivating
378 * caller can guarantee the timer does not reschedule itself in its timer 401 * the timer it also makes sure the handler has finished executing on other
379 * function. 402 * CPUs.
380 * 403 *
381 * Synchronization rules: callers must prevent restarting of the timer, 404 * Synchronization rules: callers must prevent restarting of the timer,
382 * otherwise this function is meaningless. It must not be called from 405 * otherwise this function is meaningless. It must not be called from
383 * interrupt contexts. The caller must not hold locks which wold prevent 406 * interrupt contexts. The caller must not hold locks which would prevent
384 * completion of the timer's handler. Upon exit the timer is not queued and 407 * completion of the timer's handler. The timer's handler must not call
385 * the handler is not running on any CPU. 408 * add_timer_on(). Upon exit the timer is not queued and the handler is
409 * not running on any CPU.
386 * 410 *
387 * The function returns whether it has deactivated a pending timer or not. 411 * The function returns whether it has deactivated a pending timer or not.
388 */ 412 */
389int del_singleshot_timer_sync(struct timer_list *timer) 413int del_timer_sync(struct timer_list *timer)
390{ 414{
391 int ret = del_timer(timer); 415 check_timer(timer);
392 416
393 if (!ret) { 417 for (;;) {
394 ret = del_timer_sync(timer); 418 int ret = try_to_del_timer_sync(timer);
395 BUG_ON(ret); 419 if (ret >= 0)
420 return ret;
396 } 421 }
397
398 return ret;
399} 422}
400EXPORT_SYMBOL(del_singleshot_timer_sync); 423
424EXPORT_SYMBOL(del_timer_sync);
401#endif 425#endif
402 426
403static int cascade(tvec_base_t *base, tvec_t *tv, int index) 427static int cascade(tvec_base_t *base, tvec_t *tv, int index)
@@ -415,7 +439,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
415 struct timer_list *tmp; 439 struct timer_list *tmp;
416 440
417 tmp = list_entry(curr, struct timer_list, entry); 441 tmp = list_entry(curr, struct timer_list, entry);
418 BUG_ON(tmp->base != base); 442 BUG_ON(tmp->base != &base->t_base);
419 curr = curr->next; 443 curr = curr->next;
420 internal_add_timer(base, tmp); 444 internal_add_timer(base, tmp);
421 } 445 }
@@ -437,7 +461,7 @@ static inline void __run_timers(tvec_base_t *base)
437{ 461{
438 struct timer_list *timer; 462 struct timer_list *timer;
439 463
440 spin_lock_irq(&base->lock); 464 spin_lock_irq(&base->t_base.lock);
441 while (time_after_eq(jiffies, base->timer_jiffies)) { 465 while (time_after_eq(jiffies, base->timer_jiffies)) {
442 struct list_head work_list = LIST_HEAD_INIT(work_list); 466 struct list_head work_list = LIST_HEAD_INIT(work_list);
443 struct list_head *head = &work_list; 467 struct list_head *head = &work_list;
@@ -453,8 +477,7 @@ static inline void __run_timers(tvec_base_t *base)
453 cascade(base, &base->tv5, INDEX(3)); 477 cascade(base, &base->tv5, INDEX(3));
454 ++base->timer_jiffies; 478 ++base->timer_jiffies;
455 list_splice_init(base->tv1.vec + index, &work_list); 479 list_splice_init(base->tv1.vec + index, &work_list);
456repeat: 480 while (!list_empty(head)) {
457 if (!list_empty(head)) {
458 void (*fn)(unsigned long); 481 void (*fn)(unsigned long);
459 unsigned long data; 482 unsigned long data;
460 483
@@ -462,25 +485,26 @@ repeat:
462 fn = timer->function; 485 fn = timer->function;
463 data = timer->data; 486 data = timer->data;
464 487
465 list_del(&timer->entry);
466 set_running_timer(base, timer); 488 set_running_timer(base, timer);
467 smp_wmb(); 489 detach_timer(timer, 1);
468 timer->base = NULL; 490 spin_unlock_irq(&base->t_base.lock);
469 spin_unlock_irq(&base->lock);
470 { 491 {
471 u32 preempt_count = preempt_count(); 492 int preempt_count = preempt_count();
472 fn(data); 493 fn(data);
473 if (preempt_count != preempt_count()) { 494 if (preempt_count != preempt_count()) {
474 printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); 495 printk(KERN_WARNING "huh, entered %p "
496 "with preempt_count %08x, exited"
497 " with %08x?\n",
498 fn, preempt_count,
499 preempt_count());
475 BUG(); 500 BUG();
476 } 501 }
477 } 502 }
478 spin_lock_irq(&base->lock); 503 spin_lock_irq(&base->t_base.lock);
479 goto repeat;
480 } 504 }
481 } 505 }
482 set_running_timer(base, NULL); 506 set_running_timer(base, NULL);
483 spin_unlock_irq(&base->lock); 507 spin_unlock_irq(&base->t_base.lock);
484} 508}
485 509
486#ifdef CONFIG_NO_IDLE_HZ 510#ifdef CONFIG_NO_IDLE_HZ
@@ -499,7 +523,7 @@ unsigned long next_timer_interrupt(void)
499 int i, j; 523 int i, j;
500 524
501 base = &__get_cpu_var(tvec_bases); 525 base = &__get_cpu_var(tvec_bases);
502 spin_lock(&base->lock); 526 spin_lock(&base->t_base.lock);
503 expires = base->timer_jiffies + (LONG_MAX >> 1); 527 expires = base->timer_jiffies + (LONG_MAX >> 1);
504 list = 0; 528 list = 0;
505 529
@@ -547,7 +571,7 @@ found:
547 expires = nte->expires; 571 expires = nte->expires;
548 } 572 }
549 } 573 }
550 spin_unlock(&base->lock); 574 spin_unlock(&base->t_base.lock);
551 return expires; 575 return expires;
552} 576}
553#endif 577#endif
@@ -999,7 +1023,7 @@ asmlinkage long sys_getppid(void)
999 parent = me->group_leader->real_parent; 1023 parent = me->group_leader->real_parent;
1000 for (;;) { 1024 for (;;) {
1001 pid = parent->tgid; 1025 pid = parent->tgid;
1002#ifdef CONFIG_SMP 1026#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1003{ 1027{
1004 struct task_struct *old = parent; 1028 struct task_struct *old = parent;
1005 1029
@@ -1286,9 +1310,9 @@ static void __devinit init_timers_cpu(int cpu)
1286{ 1310{
1287 int j; 1311 int j;
1288 tvec_base_t *base; 1312 tvec_base_t *base;
1289 1313
1290 base = &per_cpu(tvec_bases, cpu); 1314 base = &per_cpu(tvec_bases, cpu);
1291 spin_lock_init(&base->lock); 1315 spin_lock_init(&base->t_base.lock);
1292 for (j = 0; j < TVN_SIZE; j++) { 1316 for (j = 0; j < TVN_SIZE; j++) {
1293 INIT_LIST_HEAD(base->tv5.vec + j); 1317 INIT_LIST_HEAD(base->tv5.vec + j);
1294 INIT_LIST_HEAD(base->tv4.vec + j); 1318 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1302,22 +1326,16 @@ static void __devinit init_timers_cpu(int cpu)
1302} 1326}
1303 1327
1304#ifdef CONFIG_HOTPLUG_CPU 1328#ifdef CONFIG_HOTPLUG_CPU
1305static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1329static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1306{ 1330{
1307 struct timer_list *timer; 1331 struct timer_list *timer;
1308 1332
1309 while (!list_empty(head)) { 1333 while (!list_empty(head)) {
1310 timer = list_entry(head->next, struct timer_list, entry); 1334 timer = list_entry(head->next, struct timer_list, entry);
1311 /* We're locking backwards from __mod_timer order here, 1335 detach_timer(timer, 0);
1312 beware deadlock. */ 1336 timer->base = &new_base->t_base;
1313 if (!spin_trylock(&timer->lock))
1314 return 0;
1315 list_del(&timer->entry);
1316 internal_add_timer(new_base, timer); 1337 internal_add_timer(new_base, timer);
1317 timer->base = new_base;
1318 spin_unlock(&timer->lock);
1319 } 1338 }
1320 return 1;
1321} 1339}
1322 1340
1323static void __devinit migrate_timers(int cpu) 1341static void __devinit migrate_timers(int cpu)
@@ -1331,39 +1349,24 @@ static void __devinit migrate_timers(int cpu)
1331 new_base = &get_cpu_var(tvec_bases); 1349 new_base = &get_cpu_var(tvec_bases);
1332 1350
1333 local_irq_disable(); 1351 local_irq_disable();
1334again: 1352 spin_lock(&new_base->t_base.lock);
1335 /* Prevent deadlocks via ordering by old_base < new_base. */ 1353 spin_lock(&old_base->t_base.lock);
1336 if (old_base < new_base) {
1337 spin_lock(&new_base->lock);
1338 spin_lock(&old_base->lock);
1339 } else {
1340 spin_lock(&old_base->lock);
1341 spin_lock(&new_base->lock);
1342 }
1343 1354
1344 if (old_base->running_timer) 1355 if (old_base->t_base.running_timer)
1345 BUG(); 1356 BUG();
1346 for (i = 0; i < TVR_SIZE; i++) 1357 for (i = 0; i < TVR_SIZE; i++)
1347 if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) 1358 migrate_timer_list(new_base, old_base->tv1.vec + i);
1348 goto unlock_again; 1359 for (i = 0; i < TVN_SIZE; i++) {
1349 for (i = 0; i < TVN_SIZE; i++) 1360 migrate_timer_list(new_base, old_base->tv2.vec + i);
1350 if (!migrate_timer_list(new_base, old_base->tv2.vec + i) 1361 migrate_timer_list(new_base, old_base->tv3.vec + i);
1351 || !migrate_timer_list(new_base, old_base->tv3.vec + i) 1362 migrate_timer_list(new_base, old_base->tv4.vec + i);
1352 || !migrate_timer_list(new_base, old_base->tv4.vec + i) 1363 migrate_timer_list(new_base, old_base->tv5.vec + i);
1353 || !migrate_timer_list(new_base, old_base->tv5.vec + i)) 1364 }
1354 goto unlock_again; 1365
1355 spin_unlock(&old_base->lock); 1366 spin_unlock(&old_base->t_base.lock);
1356 spin_unlock(&new_base->lock); 1367 spin_unlock(&new_base->t_base.lock);
1357 local_irq_enable(); 1368 local_irq_enable();
1358 put_cpu_var(tvec_bases); 1369 put_cpu_var(tvec_bases);
1359 return;
1360
1361unlock_again:
1362 /* Avoid deadlock with __mod_timer, by backing off. */
1363 spin_unlock(&old_base->lock);
1364 spin_unlock(&new_base->lock);
1365 cpu_relax();
1366 goto again;
1367} 1370}
1368#endif /* CONFIG_HOTPLUG_CPU */ 1371#endif /* CONFIG_HOTPLUG_CPU */
1369 1372
@@ -1594,7 +1597,7 @@ void msleep(unsigned int msecs)
1594EXPORT_SYMBOL(msleep); 1597EXPORT_SYMBOL(msleep);
1595 1598
1596/** 1599/**
1597 * msleep_interruptible - sleep waiting for waitqueue interruptions 1600 * msleep_interruptible - sleep waiting for signals
1598 * @msecs: Time in milliseconds to sleep for 1601 * @msecs: Time in milliseconds to sleep for
1599 */ 1602 */
1600unsigned long msleep_interruptible(unsigned int msecs) 1603unsigned long msleep_interruptible(unsigned int msecs)
diff --git a/kernel/user.c b/kernel/user.c
index 734575d55769..89e562feb1b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -120,6 +120,10 @@ struct user_struct * alloc_uid(uid_t uid)
120 atomic_set(&new->processes, 0); 120 atomic_set(&new->processes, 0);
121 atomic_set(&new->files, 0); 121 atomic_set(&new->files, 0);
122 atomic_set(&new->sigpending, 0); 122 atomic_set(&new->sigpending, 0);
123#ifdef CONFIG_INOTIFY
124 atomic_set(&new->inotify_watches, 0);
125 atomic_set(&new->inotify_devs, 0);
126#endif
123 127
124 new->mq_bytes = 0; 128 new->mq_bytes = 0;
125 new->locked_shm = 0; 129 new->locked_shm = 0;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 259cf55da3c9..c7e36d4a70ca 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,8 +308,6 @@ struct workqueue_struct *__create_workqueue(const char *name,
308 struct workqueue_struct *wq; 308 struct workqueue_struct *wq;
309 struct task_struct *p; 309 struct task_struct *p;
310 310
311 BUG_ON(strlen(name) > 10);
312
313 wq = kmalloc(sizeof(*wq), GFP_KERNEL); 311 wq = kmalloc(sizeof(*wq), GFP_KERNEL);
314 if (!wq) 312 if (!wq)
315 return NULL; 313 return NULL;