diff options
Diffstat (limited to 'kernel')
40 files changed, 3315 insertions, 1047 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz new file mode 100644 index 000000000000..248e1c396f8b --- /dev/null +++ b/kernel/Kconfig.hz | |||
@@ -0,0 +1,46 @@ | |||
1 | # | ||
2 | # Timer Interrupt Frequency Configuration | ||
3 | # | ||
4 | |||
5 | choice | ||
6 | prompt "Timer frequency" | ||
7 | default HZ_250 | ||
8 | help | ||
9 | Allows the configuration of the timer frequency. It is customary | ||
10 | to have the timer interrupt run at 1000 HZ but 100 HZ may be more | ||
11 | beneficial for servers and NUMA systems that do not need to have | ||
12 | a fast response for user interaction and that may experience bus | ||
13 | contention and cacheline bounces as a result of timer interrupts. | ||
14 | Note that the timer interrupt occurs on each processor in an SMP | ||
15 | environment leading to NR_CPUS * HZ number of timer interrupts | ||
16 | per second. | ||
17 | |||
18 | |||
19 | config HZ_100 | ||
20 | bool "100 HZ" | ||
21 | help | ||
22 | 100 HZ is a typical choice for servers, SMP and NUMA systems | ||
23 | with lots of processors that may show reduced performance if | ||
24 | too many timer interrupts are occurring. | ||
25 | |||
26 | config HZ_250 | ||
27 | bool "250 HZ" | ||
28 | help | ||
29 | 250 HZ is a good compromise choice allowing server performance | ||
30 | while also showing good interactive responsiveness even | ||
31 | on SMP and NUMA systems. | ||
32 | |||
33 | config HZ_1000 | ||
34 | bool "1000 HZ" | ||
35 | help | ||
36 | 1000 HZ is the preferred choice for desktop systems and other | ||
37 | systems requiring fast interactive responses to events. | ||
38 | |||
39 | endchoice | ||
40 | |||
41 | config HZ | ||
42 | int | ||
43 | default 100 if HZ_100 | ||
44 | default 250 if HZ_250 | ||
45 | default 1000 if HZ_1000 | ||
46 | |||
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt new file mode 100644 index 000000000000..0b46a5dff4c0 --- /dev/null +++ b/kernel/Kconfig.preempt | |||
@@ -0,0 +1,65 @@ | |||
1 | |||
2 | choice | ||
3 | prompt "Preemption Model" | ||
4 | default PREEMPT_NONE | ||
5 | |||
6 | config PREEMPT_NONE | ||
7 | bool "No Forced Preemption (Server)" | ||
8 | help | ||
9 | This is the traditional Linux preemption model, geared towards | ||
10 | throughput. It will still provide good latencies most of the | ||
11 | time, but there are no guarantees and occasional longer delays | ||
12 | are possible. | ||
13 | |||
14 | Select this option if you are building a kernel for a server or | ||
15 | scientific/computation system, or if you want to maximize the | ||
16 | raw processing power of the kernel, irrespective of scheduling | ||
17 | latencies. | ||
18 | |||
19 | config PREEMPT_VOLUNTARY | ||
20 | bool "Voluntary Kernel Preemption (Desktop)" | ||
21 | help | ||
22 | This option reduces the latency of the kernel by adding more | ||
23 | "explicit preemption points" to the kernel code. These new | ||
24 | preemption points have been selected to reduce the maximum | ||
25 | latency of rescheduling, providing faster application reactions, | ||
26 | at the cost of slightly lower throughput. | ||
27 | |||
28 | This allows reaction to interactive events by allowing a | ||
29 | low priority process to voluntarily preempt itself even if it | ||
30 | is in kernel mode executing a system call. This allows | ||
31 | applications to run more 'smoothly' even when the system is | ||
32 | under load. | ||
33 | |||
34 | Select this if you are building a kernel for a desktop system. | ||
35 | |||
36 | config PREEMPT | ||
37 | bool "Preemptible Kernel (Low-Latency Desktop)" | ||
38 | help | ||
39 | This option reduces the latency of the kernel by making | ||
40 | all kernel code (that is not executing in a critical section) | ||
41 | preemptible. This allows reaction to interactive events by | ||
42 | permitting a low priority process to be preempted involuntarily | ||
43 | even if it is in kernel mode executing a system call and would | ||
44 | otherwise not be about to reach a natural preemption point. | ||
45 | This allows applications to run more 'smoothly' even when the | ||
46 | system is under load, at the cost of slightly lower throughput | ||
47 | and a slight runtime overhead to kernel code. | ||
48 | |||
49 | Select this if you are building a kernel for a desktop or | ||
50 | embedded system with latency requirements in the milliseconds | ||
51 | range. | ||
52 | |||
53 | endchoice | ||
54 | |||
55 | config PREEMPT_BKL | ||
56 | bool "Preempt The Big Kernel Lock" | ||
57 | depends on SMP || PREEMPT | ||
58 | default y | ||
59 | help | ||
60 | This option reduces the latency of the kernel by making the | ||
61 | big kernel lock preemptible. | ||
62 | |||
63 | Say Y here if you are building a kernel for a desktop system. | ||
64 | Say N if you are unsure. | ||
65 | |||
diff --git a/kernel/Makefile b/kernel/Makefile index b01d26fe8db7..cb05cd05d237 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o | |||
17 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 17 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
18 | obj-$(CONFIG_PM) += power/ | 18 | obj-$(CONFIG_PM) += power/ |
19 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 19 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
20 | obj-$(CONFIG_KEXEC) += kexec.o | ||
20 | obj-$(CONFIG_COMPAT) += compat.o | 21 | obj-$(CONFIG_COMPAT) += compat.o |
21 | obj-$(CONFIG_CPUSETS) += cpuset.o | 22 | obj-$(CONFIG_CPUSETS) += cpuset.o |
22 | obj-$(CONFIG_IKCONFIG) += configs.o | 23 | obj-$(CONFIG_IKCONFIG) += configs.o |
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | |||
27 | obj-$(CONFIG_KPROBES) += kprobes.o | 28 | obj-$(CONFIG_KPROBES) += kprobes.o |
28 | obj-$(CONFIG_SYSFS) += ksysfs.o | 29 | obj-$(CONFIG_SYSFS) += ksysfs.o |
29 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 30 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
31 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | ||
30 | obj-$(CONFIG_SECCOMP) += seccomp.o | 32 | obj-$(CONFIG_SECCOMP) += seccomp.o |
31 | 33 | ||
32 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 34 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
diff --git a/kernel/capability.c b/kernel/capability.c index 64db1ee820c2..8986a37a67ea 100644 --- a/kernel/capability.c +++ b/kernel/capability.c | |||
@@ -31,8 +31,14 @@ static DEFINE_SPINLOCK(task_capability_lock); | |||
31 | * uninteresting and/or not to be changed. | 31 | * uninteresting and/or not to be changed. |
32 | */ | 32 | */ |
33 | 33 | ||
34 | /* | 34 | /** |
35 | * sys_capget - get the capabilities of a given process. | 35 | * sys_capget - get the capabilities of a given process. |
36 | * @header: pointer to struct that contains capability version and | ||
37 | * target pid data | ||
38 | * @dataptr: pointer to struct that contains the effective, permitted, | ||
39 | * and inheritable capabilities that are returned | ||
40 | * | ||
41 | * Returns 0 on success and < 0 on error. | ||
36 | */ | 42 | */ |
37 | asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) | 43 | asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) |
38 | { | 44 | { |
@@ -141,8 +147,14 @@ static inline int cap_set_all(kernel_cap_t *effective, | |||
141 | return ret; | 147 | return ret; |
142 | } | 148 | } |
143 | 149 | ||
144 | /* | 150 | /** |
145 | * sys_capset - set capabilities for a given process, all processes, or all | 151 | * sys_capset - set capabilities for a process or a group of processes |
152 | * @header: pointer to struct that contains capability version and | ||
153 | * target pid data | ||
154 | * @data: pointer to struct that contains the effective, permitted, | ||
155 | * and inheritable capabilities | ||
156 | * | ||
157 | * Set capabilities for a given process, all processes, or all | ||
146 | * processes in a given process group. | 158 | * processes in a given process group. |
147 | * | 159 | * |
148 | * The restrictions on setting capabilities are specified as: | 160 | * The restrictions on setting capabilities are specified as: |
@@ -152,6 +164,8 @@ static inline int cap_set_all(kernel_cap_t *effective, | |||
152 | * I: any raised capabilities must be a subset of the (old current) permitted | 164 | * I: any raised capabilities must be a subset of the (old current) permitted |
153 | * P: any raised capabilities must be a subset of the (old current) permitted | 165 | * P: any raised capabilities must be a subset of the (old current) permitted |
154 | * E: must be set to a subset of (new target) permitted | 166 | * E: must be set to a subset of (new target) permitted |
167 | * | ||
168 | * Returns 0 on success and < 0 on error. | ||
155 | */ | 169 | */ |
156 | asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) | 170 | asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) |
157 | { | 171 | { |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 628f4ccda127..53d8263ae12e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused) | |||
63 | { | 63 | { |
64 | int err; | 64 | int err; |
65 | 65 | ||
66 | /* Take offline: makes arch_cpu_down somewhat easier. */ | ||
67 | cpu_clear(smp_processor_id(), cpu_online_map); | ||
68 | |||
69 | /* Ensure this CPU doesn't handle any more interrupts. */ | 66 | /* Ensure this CPU doesn't handle any more interrupts. */ |
70 | err = __cpu_disable(); | 67 | err = __cpu_disable(); |
71 | if (err < 0) | 68 | if (err < 0) |
72 | cpu_set(smp_processor_id(), cpu_online_map); | 69 | return err; |
73 | else | ||
74 | /* Force idle task to run as soon as we yield: it should | ||
75 | immediately notice cpu is offline and die quickly. */ | ||
76 | sched_idle_next(); | ||
77 | 70 | ||
78 | return err; | 71 | /* Force idle task to run as soon as we yield: it should |
72 | immediately notice cpu is offline and die quickly. */ | ||
73 | sched_idle_next(); | ||
74 | return 0; | ||
79 | } | 75 | } |
80 | 76 | ||
81 | int cpu_down(unsigned int cpu) | 77 | int cpu_down(unsigned int cpu) |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 00e8f2575512..8ab1b4e518b8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = { | |||
228 | 228 | ||
229 | static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) | 229 | static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name) |
230 | { | 230 | { |
231 | struct qstr qstr; | 231 | struct dentry *d = lookup_one_len(name, parent, strlen(name)); |
232 | struct dentry *d; | ||
233 | |||
234 | qstr.name = name; | ||
235 | qstr.len = strlen(name); | ||
236 | qstr.hash = full_name_hash(name, qstr.len); | ||
237 | d = lookup_hash(&qstr, parent); | ||
238 | if (!IS_ERR(d)) | 232 | if (!IS_ERR(d)) |
239 | d->d_op = &cpuset_dops; | 233 | d->d_op = &cpuset_dops; |
240 | return d; | 234 | return d; |
@@ -404,21 +398,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
404 | * to continue to serve a useful existence. Next time it's released, | 398 | * to continue to serve a useful existence. Next time it's released, |
405 | * we will get notified again, if it still has 'notify_on_release' set. | 399 | * we will get notified again, if it still has 'notify_on_release' set. |
406 | * | 400 | * |
407 | * Note final arg to call_usermodehelper() is 0 - that means | 401 | * The final arg to call_usermodehelper() is 0, which means don't |
408 | * don't wait. Since we are holding the global cpuset_sem here, | 402 | * wait. The separate /sbin/cpuset_release_agent task is forked by |
409 | * and we are asking another thread (started from keventd) to rmdir a | 403 | * call_usermodehelper(), then control in this thread returns here, |
410 | * cpuset, we can't wait - or we'd deadlock with the removing thread | 404 | * without waiting for the release agent task. We don't bother to |
411 | * on cpuset_sem. | 405 | * wait because the caller of this routine has no use for the exit |
406 | * status of the /sbin/cpuset_release_agent task, so no sense holding | ||
407 | * our caller up for that. | ||
408 | * | ||
409 | * The simple act of forking that task might require more memory, | ||
410 | * which might need cpuset_sem. So this routine must be called while | ||
411 | * cpuset_sem is not held, to avoid a possible deadlock. See also | ||
412 | * comments for check_for_release(), below. | ||
412 | */ | 413 | */ |
413 | 414 | ||
414 | static int cpuset_release_agent(char *cpuset_str) | 415 | static void cpuset_release_agent(const char *pathbuf) |
415 | { | 416 | { |
416 | char *argv[3], *envp[3]; | 417 | char *argv[3], *envp[3]; |
417 | int i; | 418 | int i; |
418 | 419 | ||
420 | if (!pathbuf) | ||
421 | return; | ||
422 | |||
419 | i = 0; | 423 | i = 0; |
420 | argv[i++] = "/sbin/cpuset_release_agent"; | 424 | argv[i++] = "/sbin/cpuset_release_agent"; |
421 | argv[i++] = cpuset_str; | 425 | argv[i++] = (char *)pathbuf; |
422 | argv[i] = NULL; | 426 | argv[i] = NULL; |
423 | 427 | ||
424 | i = 0; | 428 | i = 0; |
@@ -427,17 +431,29 @@ static int cpuset_release_agent(char *cpuset_str) | |||
427 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | 431 | envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; |
428 | envp[i] = NULL; | 432 | envp[i] = NULL; |
429 | 433 | ||
430 | return call_usermodehelper(argv[0], argv, envp, 0); | 434 | call_usermodehelper(argv[0], argv, envp, 0); |
435 | kfree(pathbuf); | ||
431 | } | 436 | } |
432 | 437 | ||
433 | /* | 438 | /* |
434 | * Either cs->count of using tasks transitioned to zero, or the | 439 | * Either cs->count of using tasks transitioned to zero, or the |
435 | * cs->children list of child cpusets just became empty. If this | 440 | * cs->children list of child cpusets just became empty. If this |
436 | * cs is notify_on_release() and now both the user count is zero and | 441 | * cs is notify_on_release() and now both the user count is zero and |
437 | * the list of children is empty, send notice to user land. | 442 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
443 | * buffer, to be returned via ppathbuf, so that the caller can invoke | ||
444 | * cpuset_release_agent() with it later on, once cpuset_sem is dropped. | ||
445 | * Call here with cpuset_sem held. | ||
446 | * | ||
447 | * This check_for_release() routine is responsible for kmalloc'ing | ||
448 | * pathbuf. The above cpuset_release_agent() is responsible for | ||
449 | * kfree'ing pathbuf. The caller of these routines is responsible | ||
450 | * for providing a pathbuf pointer, initialized to NULL, then | ||
451 | * calling check_for_release() with cpuset_sem held and the address | ||
452 | * of the pathbuf pointer, then dropping cpuset_sem, then calling | ||
453 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | ||
438 | */ | 454 | */ |
439 | 455 | ||
440 | static void check_for_release(struct cpuset *cs) | 456 | static void check_for_release(struct cpuset *cs, char **ppathbuf) |
441 | { | 457 | { |
442 | if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && | 458 | if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && |
443 | list_empty(&cs->children)) { | 459 | list_empty(&cs->children)) { |
@@ -447,10 +463,9 @@ static void check_for_release(struct cpuset *cs) | |||
447 | if (!buf) | 463 | if (!buf) |
448 | return; | 464 | return; |
449 | if (cpuset_path(cs, buf, PAGE_SIZE) < 0) | 465 | if (cpuset_path(cs, buf, PAGE_SIZE) < 0) |
450 | goto out; | 466 | kfree(buf); |
451 | cpuset_release_agent(buf); | 467 | else |
452 | out: | 468 | *ppathbuf = buf; |
453 | kfree(buf); | ||
454 | } | 469 | } |
455 | } | 470 | } |
456 | 471 | ||
@@ -601,10 +616,75 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
601 | return 0; | 616 | return 0; |
602 | } | 617 | } |
603 | 618 | ||
619 | /* | ||
620 | * For a given cpuset cur, partition the system as follows | ||
621 | * a. All cpus in the parent cpuset's cpus_allowed that are not part of any | ||
622 | * exclusive child cpusets | ||
623 | * b. All cpus in the current cpuset's cpus_allowed that are not part of any | ||
624 | * exclusive child cpusets | ||
625 | * Build these two partitions by calling partition_sched_domains | ||
626 | * | ||
627 | * Call with cpuset_sem held. May nest a call to the | ||
628 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | ||
629 | */ | ||
630 | |||
631 | /* | ||
632 | * Hack to avoid 2.6.13 partial node dynamic sched domain bug. | ||
633 | * Disable letting 'cpu_exclusive' cpusets define dynamic sched | ||
634 | * domains, until the sched domain can handle partial nodes. | ||
635 | * Remove this #if hackery when sched domains fixed. | ||
636 | */ | ||
637 | #if 0 | ||
638 | static void update_cpu_domains(struct cpuset *cur) | ||
639 | { | ||
640 | struct cpuset *c, *par = cur->parent; | ||
641 | cpumask_t pspan, cspan; | ||
642 | |||
643 | if (par == NULL || cpus_empty(cur->cpus_allowed)) | ||
644 | return; | ||
645 | |||
646 | /* | ||
647 | * Get all cpus from parent's cpus_allowed not part of exclusive | ||
648 | * children | ||
649 | */ | ||
650 | pspan = par->cpus_allowed; | ||
651 | list_for_each_entry(c, &par->children, sibling) { | ||
652 | if (is_cpu_exclusive(c)) | ||
653 | cpus_andnot(pspan, pspan, c->cpus_allowed); | ||
654 | } | ||
655 | if (is_removed(cur) || !is_cpu_exclusive(cur)) { | ||
656 | cpus_or(pspan, pspan, cur->cpus_allowed); | ||
657 | if (cpus_equal(pspan, cur->cpus_allowed)) | ||
658 | return; | ||
659 | cspan = CPU_MASK_NONE; | ||
660 | } else { | ||
661 | if (cpus_empty(pspan)) | ||
662 | return; | ||
663 | cspan = cur->cpus_allowed; | ||
664 | /* | ||
665 | * Get all cpus from current cpuset's cpus_allowed not part | ||
666 | * of exclusive children | ||
667 | */ | ||
668 | list_for_each_entry(c, &cur->children, sibling) { | ||
669 | if (is_cpu_exclusive(c)) | ||
670 | cpus_andnot(cspan, cspan, c->cpus_allowed); | ||
671 | } | ||
672 | } | ||
673 | |||
674 | lock_cpu_hotplug(); | ||
675 | partition_sched_domains(&pspan, &cspan); | ||
676 | unlock_cpu_hotplug(); | ||
677 | } | ||
678 | #else | ||
679 | static void update_cpu_domains(struct cpuset *cur) | ||
680 | { | ||
681 | } | ||
682 | #endif | ||
683 | |||
604 | static int update_cpumask(struct cpuset *cs, char *buf) | 684 | static int update_cpumask(struct cpuset *cs, char *buf) |
605 | { | 685 | { |
606 | struct cpuset trialcs; | 686 | struct cpuset trialcs; |
607 | int retval; | 687 | int retval, cpus_unchanged; |
608 | 688 | ||
609 | trialcs = *cs; | 689 | trialcs = *cs; |
610 | retval = cpulist_parse(buf, trialcs.cpus_allowed); | 690 | retval = cpulist_parse(buf, trialcs.cpus_allowed); |
@@ -614,9 +694,13 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
614 | if (cpus_empty(trialcs.cpus_allowed)) | 694 | if (cpus_empty(trialcs.cpus_allowed)) |
615 | return -ENOSPC; | 695 | return -ENOSPC; |
616 | retval = validate_change(cs, &trialcs); | 696 | retval = validate_change(cs, &trialcs); |
617 | if (retval == 0) | 697 | if (retval < 0) |
618 | cs->cpus_allowed = trialcs.cpus_allowed; | 698 | return retval; |
619 | return retval; | 699 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
700 | cs->cpus_allowed = trialcs.cpus_allowed; | ||
701 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | ||
702 | update_cpu_domains(cs); | ||
703 | return 0; | ||
620 | } | 704 | } |
621 | 705 | ||
622 | static int update_nodemask(struct cpuset *cs, char *buf) | 706 | static int update_nodemask(struct cpuset *cs, char *buf) |
@@ -652,7 +736,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
652 | { | 736 | { |
653 | int turning_on; | 737 | int turning_on; |
654 | struct cpuset trialcs; | 738 | struct cpuset trialcs; |
655 | int err; | 739 | int err, cpu_exclusive_changed; |
656 | 740 | ||
657 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); | 741 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); |
658 | 742 | ||
@@ -663,23 +747,28 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
663 | clear_bit(bit, &trialcs.flags); | 747 | clear_bit(bit, &trialcs.flags); |
664 | 748 | ||
665 | err = validate_change(cs, &trialcs); | 749 | err = validate_change(cs, &trialcs); |
666 | if (err == 0) { | 750 | if (err < 0) |
667 | if (turning_on) | 751 | return err; |
668 | set_bit(bit, &cs->flags); | 752 | cpu_exclusive_changed = |
669 | else | 753 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
670 | clear_bit(bit, &cs->flags); | 754 | if (turning_on) |
671 | } | 755 | set_bit(bit, &cs->flags); |
672 | return err; | 756 | else |
757 | clear_bit(bit, &cs->flags); | ||
758 | |||
759 | if (cpu_exclusive_changed) | ||
760 | update_cpu_domains(cs); | ||
761 | return 0; | ||
673 | } | 762 | } |
674 | 763 | ||
675 | static int attach_task(struct cpuset *cs, char *buf) | 764 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) |
676 | { | 765 | { |
677 | pid_t pid; | 766 | pid_t pid; |
678 | struct task_struct *tsk; | 767 | struct task_struct *tsk; |
679 | struct cpuset *oldcs; | 768 | struct cpuset *oldcs; |
680 | cpumask_t cpus; | 769 | cpumask_t cpus; |
681 | 770 | ||
682 | if (sscanf(buf, "%d", &pid) != 1) | 771 | if (sscanf(pidbuf, "%d", &pid) != 1) |
683 | return -EIO; | 772 | return -EIO; |
684 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 773 | if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
685 | return -ENOSPC; | 774 | return -ENOSPC; |
@@ -722,7 +811,7 @@ static int attach_task(struct cpuset *cs, char *buf) | |||
722 | 811 | ||
723 | put_task_struct(tsk); | 812 | put_task_struct(tsk); |
724 | if (atomic_dec_and_test(&oldcs->count)) | 813 | if (atomic_dec_and_test(&oldcs->count)) |
725 | check_for_release(oldcs); | 814 | check_for_release(oldcs, ppathbuf); |
726 | return 0; | 815 | return 0; |
727 | } | 816 | } |
728 | 817 | ||
@@ -746,6 +835,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
746 | struct cftype *cft = __d_cft(file->f_dentry); | 835 | struct cftype *cft = __d_cft(file->f_dentry); |
747 | cpuset_filetype_t type = cft->private; | 836 | cpuset_filetype_t type = cft->private; |
748 | char *buffer; | 837 | char *buffer; |
838 | char *pathbuf = NULL; | ||
749 | int retval = 0; | 839 | int retval = 0; |
750 | 840 | ||
751 | /* Crude upper limit on largest legitimate cpulist user might write. */ | 841 | /* Crude upper limit on largest legitimate cpulist user might write. */ |
@@ -786,7 +876,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
786 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); | 876 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); |
787 | break; | 877 | break; |
788 | case FILE_TASKLIST: | 878 | case FILE_TASKLIST: |
789 | retval = attach_task(cs, buffer); | 879 | retval = attach_task(cs, buffer, &pathbuf); |
790 | break; | 880 | break; |
791 | default: | 881 | default: |
792 | retval = -EINVAL; | 882 | retval = -EINVAL; |
@@ -797,6 +887,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
797 | retval = nbytes; | 887 | retval = nbytes; |
798 | out2: | 888 | out2: |
799 | up(&cpuset_sem); | 889 | up(&cpuset_sem); |
890 | cpuset_release_agent(pathbuf); | ||
800 | out1: | 891 | out1: |
801 | kfree(buffer); | 892 | kfree(buffer); |
802 | return retval; | 893 | return retval; |
@@ -1302,6 +1393,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1302 | struct cpuset *cs = dentry->d_fsdata; | 1393 | struct cpuset *cs = dentry->d_fsdata; |
1303 | struct dentry *d; | 1394 | struct dentry *d; |
1304 | struct cpuset *parent; | 1395 | struct cpuset *parent; |
1396 | char *pathbuf = NULL; | ||
1305 | 1397 | ||
1306 | /* the vfs holds both inode->i_sem already */ | 1398 | /* the vfs holds both inode->i_sem already */ |
1307 | 1399 | ||
@@ -1315,18 +1407,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1315 | up(&cpuset_sem); | 1407 | up(&cpuset_sem); |
1316 | return -EBUSY; | 1408 | return -EBUSY; |
1317 | } | 1409 | } |
1318 | spin_lock(&cs->dentry->d_lock); | ||
1319 | parent = cs->parent; | 1410 | parent = cs->parent; |
1320 | set_bit(CS_REMOVED, &cs->flags); | 1411 | set_bit(CS_REMOVED, &cs->flags); |
1412 | if (is_cpu_exclusive(cs)) | ||
1413 | update_cpu_domains(cs); | ||
1321 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1414 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
1322 | if (list_empty(&parent->children)) | 1415 | if (list_empty(&parent->children)) |
1323 | check_for_release(parent); | 1416 | check_for_release(parent, &pathbuf); |
1417 | spin_lock(&cs->dentry->d_lock); | ||
1324 | d = dget(cs->dentry); | 1418 | d = dget(cs->dentry); |
1325 | cs->dentry = NULL; | 1419 | cs->dentry = NULL; |
1326 | spin_unlock(&d->d_lock); | 1420 | spin_unlock(&d->d_lock); |
1327 | cpuset_d_remove_dir(d); | 1421 | cpuset_d_remove_dir(d); |
1328 | dput(d); | 1422 | dput(d); |
1329 | up(&cpuset_sem); | 1423 | up(&cpuset_sem); |
1424 | cpuset_release_agent(pathbuf); | ||
1330 | return 0; | 1425 | return 0; |
1331 | } | 1426 | } |
1332 | 1427 | ||
@@ -1383,10 +1478,10 @@ void __init cpuset_init_smp(void) | |||
1383 | 1478 | ||
1384 | /** | 1479 | /** |
1385 | * cpuset_fork - attach newly forked task to its parent's cpuset. | 1480 | * cpuset_fork - attach newly forked task to its parent's cpuset. |
1386 | * @p: pointer to task_struct of forking parent process. | 1481 | * @tsk: pointer to task_struct of forking parent process. |
1387 | * | 1482 | * |
1388 | * Description: By default, on fork, a task inherits its | 1483 | * Description: By default, on fork, a task inherits its |
1389 | * parents cpuset. The pointer to the shared cpuset is | 1484 | * parent's cpuset. The pointer to the shared cpuset is |
1390 | * automatically copied in fork.c by dup_task_struct(). | 1485 | * automatically copied in fork.c by dup_task_struct(). |
1391 | * This cpuset_fork() routine need only increment the usage | 1486 | * This cpuset_fork() routine need only increment the usage |
1392 | * counter in that cpuset. | 1487 | * counter in that cpuset. |
@@ -1414,7 +1509,6 @@ void cpuset_fork(struct task_struct *tsk) | |||
1414 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, | 1509 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, |
1415 | * then a zero cpuset use count is a license to any other task to | 1510 | * then a zero cpuset use count is a license to any other task to |
1416 | * nuke the cpuset immediately. | 1511 | * nuke the cpuset immediately. |
1417 | * | ||
1418 | **/ | 1512 | **/ |
1419 | 1513 | ||
1420 | void cpuset_exit(struct task_struct *tsk) | 1514 | void cpuset_exit(struct task_struct *tsk) |
@@ -1427,10 +1521,13 @@ void cpuset_exit(struct task_struct *tsk) | |||
1427 | task_unlock(tsk); | 1521 | task_unlock(tsk); |
1428 | 1522 | ||
1429 | if (notify_on_release(cs)) { | 1523 | if (notify_on_release(cs)) { |
1524 | char *pathbuf = NULL; | ||
1525 | |||
1430 | down(&cpuset_sem); | 1526 | down(&cpuset_sem); |
1431 | if (atomic_dec_and_test(&cs->count)) | 1527 | if (atomic_dec_and_test(&cs->count)) |
1432 | check_for_release(cs); | 1528 | check_for_release(cs, &pathbuf); |
1433 | up(&cpuset_sem); | 1529 | up(&cpuset_sem); |
1530 | cpuset_release_agent(pathbuf); | ||
1434 | } else { | 1531 | } else { |
1435 | atomic_dec(&cs->count); | 1532 | atomic_dec(&cs->count); |
1436 | } | 1533 | } |
@@ -1464,7 +1561,9 @@ void cpuset_init_current_mems_allowed(void) | |||
1464 | current->mems_allowed = NODE_MASK_ALL; | 1561 | current->mems_allowed = NODE_MASK_ALL; |
1465 | } | 1562 | } |
1466 | 1563 | ||
1467 | /* | 1564 | /** |
1565 | * cpuset_update_current_mems_allowed - update mems parameters to new values | ||
1566 | * | ||
1468 | * If the current task's cpuset's mems_allowed changed behind our backs, | 1567 | * If the current task's cpuset's mems_allowed changed behind our backs, |
1469 | * update current->mems_allowed and mems_generation to the new value. | 1568 | * update current->mems_allowed and mems_generation to the new value. |
1470 | * Do not call this routine if in_interrupt(). | 1569 | * Do not call this routine if in_interrupt(). |
@@ -1483,13 +1582,20 @@ void cpuset_update_current_mems_allowed(void) | |||
1483 | } | 1582 | } |
1484 | } | 1583 | } |
1485 | 1584 | ||
1585 | /** | ||
1586 | * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed | ||
1587 | * @nodes: pointer to a node bitmap that is and-ed with mems_allowed | ||
1588 | */ | ||
1486 | void cpuset_restrict_to_mems_allowed(unsigned long *nodes) | 1589 | void cpuset_restrict_to_mems_allowed(unsigned long *nodes) |
1487 | { | 1590 | { |
1488 | bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), | 1591 | bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), |
1489 | MAX_NUMNODES); | 1592 | MAX_NUMNODES); |
1490 | } | 1593 | } |
1491 | 1594 | ||
1492 | /* | 1595 | /** |
1596 | * cpuset_zonelist_valid_mems_allowed - check zonelist vs. current mems_allowed | ||
1597 | * @zl: the zonelist to be checked | ||
1598 | * | ||
1493 | * Are any of the nodes on zonelist zl allowed in current->mems_allowed? | 1599 | * Are any of the nodes on zonelist zl allowed in current->mems_allowed? |
1494 | */ | 1600 | */ |
1495 | int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | 1601 | int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) |
@@ -1505,8 +1611,12 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
1505 | return 0; | 1611 | return 0; |
1506 | } | 1612 | } |
1507 | 1613 | ||
1508 | /* | 1614 | /** |
1509 | * Is 'current' valid, and is zone z allowed in current->mems_allowed? | 1615 | * cpuset_zone_allowed - is zone z allowed in current->mems_allowed |
1616 | * @z: zone in question | ||
1617 | * | ||
1618 | * Is zone z allowed in current->mems_allowed, or is | ||
1619 | * the CPU in interrupt context? (zone is always allowed in this case) | ||
1510 | */ | 1620 | */ |
1511 | int cpuset_zone_allowed(struct zone *z) | 1621 | int cpuset_zone_allowed(struct zone *z) |
1512 | { | 1622 | { |
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c new file mode 100644 index 000000000000..334c37f5218a --- /dev/null +++ b/kernel/crash_dump.c | |||
@@ -0,0 +1,61 @@ | |||
1 | /* | ||
2 | * kernel/crash_dump.c - Memory preserving reboot related code. | ||
3 | * | ||
4 | * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) | ||
5 | * Copyright (C) IBM Corporation, 2004. All rights reserved | ||
6 | */ | ||
7 | |||
8 | #include <linux/smp_lock.h> | ||
9 | #include <linux/errno.h> | ||
10 | #include <linux/proc_fs.h> | ||
11 | #include <linux/bootmem.h> | ||
12 | #include <linux/highmem.h> | ||
13 | #include <linux/crash_dump.h> | ||
14 | |||
15 | #include <asm/io.h> | ||
16 | #include <asm/uaccess.h> | ||
17 | |||
18 | /* Stores the physical address of elf header of crash image. */ | ||
19 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | ||
20 | |||
21 | /** | ||
22 | * copy_oldmem_page - copy one page from "oldmem" | ||
23 | * @pfn: page frame number to be copied | ||
24 | * @buf: target memory address for the copy; this can be in kernel address | ||
25 | * space or user address space (see @userbuf) | ||
26 | * @csize: number of bytes to copy | ||
27 | * @offset: offset in bytes into the page (based on pfn) to begin the copy | ||
28 | * @userbuf: if set, @buf is in user address space, use copy_to_user(), | ||
29 | * otherwise @buf is in kernel address space, use memcpy(). | ||
30 | * | ||
31 | * Copy a page from "oldmem". For this page, there is no pte mapped | ||
32 | * in the current kernel. We stitch up a pte, similar to kmap_atomic. | ||
33 | */ | ||
34 | ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | ||
35 | size_t csize, unsigned long offset, int userbuf) | ||
36 | { | ||
37 | void *page, *vaddr; | ||
38 | |||
39 | if (!csize) | ||
40 | return 0; | ||
41 | |||
42 | page = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
43 | if (!page) | ||
44 | return -ENOMEM; | ||
45 | |||
46 | vaddr = kmap_atomic_pfn(pfn, KM_PTE0); | ||
47 | copy_page(page, vaddr); | ||
48 | kunmap_atomic(vaddr, KM_PTE0); | ||
49 | |||
50 | if (userbuf) { | ||
51 | if (copy_to_user(buf, (page + offset), csize)) { | ||
52 | kfree(page); | ||
53 | return -EFAULT; | ||
54 | } | ||
55 | } else { | ||
56 | memcpy(buf, (page + offset), csize); | ||
57 | } | ||
58 | |||
59 | kfree(page); | ||
60 | return csize; | ||
61 | } | ||
diff --git a/kernel/exit.c b/kernel/exit.c index 2ef2ad540201..5b0fb9f09f21 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -72,6 +72,11 @@ repeat: | |||
72 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 72 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
73 | __exit_signal(p); | 73 | __exit_signal(p); |
74 | __exit_sighand(p); | 74 | __exit_sighand(p); |
75 | /* | ||
76 | * Note that the fastpath in sys_times depends on __exit_signal having | ||
77 | * updated the counters before a task is removed from the tasklist of | ||
78 | * the process by __unhash_process. | ||
79 | */ | ||
75 | __unhash_process(p); | 80 | __unhash_process(p); |
76 | 81 | ||
77 | /* | 82 | /* |
@@ -779,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
779 | 784 | ||
780 | profile_task_exit(tsk); | 785 | profile_task_exit(tsk); |
781 | 786 | ||
787 | WARN_ON(atomic_read(&tsk->fs_excl)); | ||
788 | |||
782 | if (unlikely(in_interrupt())) | 789 | if (unlikely(in_interrupt())) |
783 | panic("Aiee, killing interrupt handler!"); | 790 | panic("Aiee, killing interrupt handler!"); |
784 | if (unlikely(!tsk->pid)) | 791 | if (unlikely(!tsk->pid)) |
@@ -793,6 +800,17 @@ fastcall NORET_TYPE void do_exit(long code) | |||
793 | ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); | 800 | ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP); |
794 | } | 801 | } |
795 | 802 | ||
803 | /* | ||
804 | * We're taking recursive faults here in do_exit. Safest is to just | ||
805 | * leave this task alone and wait for reboot. | ||
806 | */ | ||
807 | if (unlikely(tsk->flags & PF_EXITING)) { | ||
808 | printk(KERN_ALERT | ||
809 | "Fixing recursive fault but reboot is needed!\n"); | ||
810 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
811 | schedule(); | ||
812 | } | ||
813 | |||
796 | tsk->flags |= PF_EXITING; | 814 | tsk->flags |= PF_EXITING; |
797 | 815 | ||
798 | /* | 816 | /* |
@@ -811,8 +829,10 @@ fastcall NORET_TYPE void do_exit(long code) | |||
811 | acct_update_integrals(tsk); | 829 | acct_update_integrals(tsk); |
812 | update_mem_hiwater(tsk); | 830 | update_mem_hiwater(tsk); |
813 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 831 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
814 | if (group_dead) | 832 | if (group_dead) { |
833 | del_timer_sync(&tsk->signal->real_timer); | ||
815 | acct_process(code); | 834 | acct_process(code); |
835 | } | ||
816 | exit_mm(tsk); | 836 | exit_mm(tsk); |
817 | 837 | ||
818 | exit_sem(tsk); | 838 | exit_sem(tsk); |
diff --git a/kernel/fork.c b/kernel/fork.c index a28d11e10877..b65187f0c74e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -208,8 +208,10 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
208 | struct file *file; | 208 | struct file *file; |
209 | 209 | ||
210 | if (mpnt->vm_flags & VM_DONTCOPY) { | 210 | if (mpnt->vm_flags & VM_DONTCOPY) { |
211 | long pages = vma_pages(mpnt); | ||
212 | mm->total_vm -= pages; | ||
211 | __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, | 213 | __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, |
212 | -vma_pages(mpnt)); | 214 | -pages); |
213 | continue; | 215 | continue; |
214 | } | 216 | } |
215 | charge = 0; | 217 | charge = 0; |
@@ -1003,9 +1005,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1003 | p->pdeath_signal = 0; | 1005 | p->pdeath_signal = 0; |
1004 | p->exit_state = 0; | 1006 | p->exit_state = 0; |
1005 | 1007 | ||
1006 | /* Perform scheduler related setup */ | ||
1007 | sched_fork(p); | ||
1008 | |||
1009 | /* | 1008 | /* |
1010 | * Ok, make it visible to the rest of the system. | 1009 | * Ok, make it visible to the rest of the system. |
1011 | * We dont wake it up yet. | 1010 | * We dont wake it up yet. |
@@ -1014,18 +1013,24 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1014 | INIT_LIST_HEAD(&p->ptrace_children); | 1013 | INIT_LIST_HEAD(&p->ptrace_children); |
1015 | INIT_LIST_HEAD(&p->ptrace_list); | 1014 | INIT_LIST_HEAD(&p->ptrace_list); |
1016 | 1015 | ||
1016 | /* Perform scheduler related setup. Assign this task to a CPU. */ | ||
1017 | sched_fork(p, clone_flags); | ||
1018 | |||
1017 | /* Need tasklist lock for parent etc handling! */ | 1019 | /* Need tasklist lock for parent etc handling! */ |
1018 | write_lock_irq(&tasklist_lock); | 1020 | write_lock_irq(&tasklist_lock); |
1019 | 1021 | ||
1020 | /* | 1022 | /* |
1021 | * The task hasn't been attached yet, so cpus_allowed mask cannot | 1023 | * The task hasn't been attached yet, so its cpus_allowed mask will |
1022 | * have changed. The cpus_allowed mask of the parent may have | 1024 | * not be changed, nor will its assigned CPU. |
1023 | * changed after it was copied first time, and it may then move to | 1025 | * |
1024 | * another CPU - so we re-copy it here and set the child's CPU to | 1026 | * The cpus_allowed mask of the parent may have changed after it was |
1025 | * the parent's CPU. This avoids alot of nasty races. | 1027 | * copied first time - so re-copy it here, then check the child's CPU |
1028 | * to ensure it is on a valid CPU (and if not, just force it back to | ||
1029 | * parent's CPU). This avoids alot of nasty races. | ||
1026 | */ | 1030 | */ |
1027 | p->cpus_allowed = current->cpus_allowed; | 1031 | p->cpus_allowed = current->cpus_allowed; |
1028 | set_task_cpu(p, smp_processor_id()); | 1032 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed))) |
1033 | set_task_cpu(p, smp_processor_id()); | ||
1029 | 1034 | ||
1030 | /* | 1035 | /* |
1031 | * Check for pending SIGKILL! The new thread should not be allowed | 1036 | * Check for pending SIGKILL! The new thread should not be allowed |
@@ -1087,6 +1092,11 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1087 | spin_unlock(&current->sighand->siglock); | 1092 | spin_unlock(&current->sighand->siglock); |
1088 | } | 1093 | } |
1089 | 1094 | ||
1095 | /* | ||
1096 | * inherit ioprio | ||
1097 | */ | ||
1098 | p->ioprio = current->ioprio; | ||
1099 | |||
1090 | SET_LINKS(p); | 1100 | SET_LINKS(p); |
1091 | if (unlikely(p->ptrace & PT_PTRACED)) | 1101 | if (unlikely(p->ptrace & PT_PTRACED)) |
1092 | __ptrace_link(p, current->parent); | 1102 | __ptrace_link(p, current->parent); |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 98d62d8efeaf..3467097ca61a 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/irq.h> | 9 | #include <linux/irq.h> |
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | #include <linux/delay.h> | ||
12 | 13 | ||
13 | /* | 14 | /* |
14 | * Autodetection depends on the fact that any interrupt that | 15 | * Autodetection depends on the fact that any interrupt that |
@@ -26,7 +27,7 @@ static DECLARE_MUTEX(probe_sem); | |||
26 | */ | 27 | */ |
27 | unsigned long probe_irq_on(void) | 28 | unsigned long probe_irq_on(void) |
28 | { | 29 | { |
29 | unsigned long val, delay; | 30 | unsigned long val; |
30 | irq_desc_t *desc; | 31 | irq_desc_t *desc; |
31 | unsigned int i; | 32 | unsigned int i; |
32 | 33 | ||
@@ -45,8 +46,7 @@ unsigned long probe_irq_on(void) | |||
45 | } | 46 | } |
46 | 47 | ||
47 | /* Wait for longstanding interrupts to trigger. */ | 48 | /* Wait for longstanding interrupts to trigger. */ |
48 | for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) | 49 | msleep(20); |
49 | /* about 20ms delay */ barrier(); | ||
50 | 50 | ||
51 | /* | 51 | /* |
52 | * enable any unassigned irqs | 52 | * enable any unassigned irqs |
@@ -68,8 +68,7 @@ unsigned long probe_irq_on(void) | |||
68 | /* | 68 | /* |
69 | * Wait for spurious interrupts to trigger | 69 | * Wait for spurious interrupts to trigger |
70 | */ | 70 | */ |
71 | for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) | 71 | msleep(100); |
72 | /* about 100ms delay */ barrier(); | ||
73 | 72 | ||
74 | /* | 73 | /* |
75 | * Now filter out any obviously spurious interrupts | 74 | * Now filter out any obviously spurious interrupts |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 436c7d93c00a..c29f83c16497 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -172,7 +172,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
172 | 172 | ||
173 | spin_lock(&desc->lock); | 173 | spin_lock(&desc->lock); |
174 | if (!noirqdebug) | 174 | if (!noirqdebug) |
175 | note_interrupt(irq, desc, action_ret); | 175 | note_interrupt(irq, desc, action_ret, regs); |
176 | if (likely(!(desc->status & IRQ_PENDING))) | 176 | if (likely(!(desc->status & IRQ_PENDING))) |
177 | break; | 177 | break; |
178 | desc->status &= ~IRQ_PENDING; | 178 | desc->status &= ~IRQ_PENDING; |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index f6297c306905..7df9abd5ec86 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -11,6 +11,83 @@ | |||
11 | #include <linux/kallsyms.h> | 11 | #include <linux/kallsyms.h> |
12 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
13 | 13 | ||
14 | static int irqfixup; | ||
15 | |||
16 | /* | ||
17 | * Recovery handler for misrouted interrupts. | ||
18 | */ | ||
19 | |||
20 | static int misrouted_irq(int irq, struct pt_regs *regs) | ||
21 | { | ||
22 | int i; | ||
23 | irq_desc_t *desc; | ||
24 | int ok = 0; | ||
25 | int work = 0; /* Did we do work for a real IRQ */ | ||
26 | |||
27 | for(i = 1; i < NR_IRQS; i++) { | ||
28 | struct irqaction *action; | ||
29 | |||
30 | if (i == irq) /* Already tried */ | ||
31 | continue; | ||
32 | desc = &irq_desc[i]; | ||
33 | spin_lock(&desc->lock); | ||
34 | action = desc->action; | ||
35 | /* Already running on another processor */ | ||
36 | if (desc->status & IRQ_INPROGRESS) { | ||
37 | /* | ||
38 | * Already running: If it is shared get the other | ||
39 | * CPU to go looking for our mystery interrupt too | ||
40 | */ | ||
41 | if (desc->action && (desc->action->flags & SA_SHIRQ)) | ||
42 | desc->status |= IRQ_PENDING; | ||
43 | spin_unlock(&desc->lock); | ||
44 | continue; | ||
45 | } | ||
46 | /* Honour the normal IRQ locking */ | ||
47 | desc->status |= IRQ_INPROGRESS; | ||
48 | spin_unlock(&desc->lock); | ||
49 | while (action) { | ||
50 | /* Only shared IRQ handlers are safe to call */ | ||
51 | if (action->flags & SA_SHIRQ) { | ||
52 | if (action->handler(i, action->dev_id, regs) == | ||
53 | IRQ_HANDLED) | ||
54 | ok = 1; | ||
55 | } | ||
56 | action = action->next; | ||
57 | } | ||
58 | local_irq_disable(); | ||
59 | /* Now clean up the flags */ | ||
60 | spin_lock(&desc->lock); | ||
61 | action = desc->action; | ||
62 | |||
63 | /* | ||
64 | * While we were looking for a fixup someone queued a real | ||
65 | * IRQ clashing with our walk | ||
66 | */ | ||
67 | |||
68 | while ((desc->status & IRQ_PENDING) && action) { | ||
69 | /* | ||
70 | * Perform real IRQ processing for the IRQ we deferred | ||
71 | */ | ||
72 | work = 1; | ||
73 | spin_unlock(&desc->lock); | ||
74 | handle_IRQ_event(i, regs, action); | ||
75 | spin_lock(&desc->lock); | ||
76 | desc->status &= ~IRQ_PENDING; | ||
77 | } | ||
78 | desc->status &= ~IRQ_INPROGRESS; | ||
79 | /* | ||
80 | * If we did actual work for the real IRQ line we must let the | ||
81 | * IRQ controller clean up too | ||
82 | */ | ||
83 | if(work) | ||
84 | desc->handler->end(i); | ||
85 | spin_unlock(&desc->lock); | ||
86 | } | ||
87 | /* So the caller can adjust the irq error counts */ | ||
88 | return ok; | ||
89 | } | ||
90 | |||
14 | /* | 91 | /* |
15 | * If 99,900 of the previous 100,000 interrupts have not been handled | 92 | * If 99,900 of the previous 100,000 interrupts have not been handled |
16 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic | 93 | * then assume that the IRQ is stuck in some manner. Drop a diagnostic |
@@ -31,7 +108,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
31 | printk(KERN_ERR "irq event %d: bogus return value %x\n", | 108 | printk(KERN_ERR "irq event %d: bogus return value %x\n", |
32 | irq, action_ret); | 109 | irq, action_ret); |
33 | } else { | 110 | } else { |
34 | printk(KERN_ERR "irq %d: nobody cared!\n", irq); | 111 | printk(KERN_ERR "irq %d: nobody cared (try booting with " |
112 | "the \"irqpoll\" option)\n", irq); | ||
35 | } | 113 | } |
36 | dump_stack(); | 114 | dump_stack(); |
37 | printk(KERN_ERR "handlers:\n"); | 115 | printk(KERN_ERR "handlers:\n"); |
@@ -45,7 +123,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
45 | } | 123 | } |
46 | } | 124 | } |
47 | 125 | ||
48 | void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 126 | static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) |
49 | { | 127 | { |
50 | static int count = 100; | 128 | static int count = 100; |
51 | 129 | ||
@@ -55,7 +133,8 @@ void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
55 | } | 133 | } |
56 | } | 134 | } |
57 | 135 | ||
58 | void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | 136 | void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, |
137 | struct pt_regs *regs) | ||
59 | { | 138 | { |
60 | if (action_ret != IRQ_HANDLED) { | 139 | if (action_ret != IRQ_HANDLED) { |
61 | desc->irqs_unhandled++; | 140 | desc->irqs_unhandled++; |
@@ -63,6 +142,15 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) | |||
63 | report_bad_irq(irq, desc, action_ret); | 142 | report_bad_irq(irq, desc, action_ret); |
64 | } | 143 | } |
65 | 144 | ||
145 | if (unlikely(irqfixup)) { | ||
146 | /* Don't punish working computers */ | ||
147 | if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) { | ||
148 | int ok = misrouted_irq(irq, regs); | ||
149 | if (action_ret == IRQ_NONE) | ||
150 | desc->irqs_unhandled -= ok; | ||
151 | } | ||
152 | } | ||
153 | |||
66 | desc->irq_count++; | 154 | desc->irq_count++; |
67 | if (desc->irq_count < 100000) | 155 | if (desc->irq_count < 100000) |
68 | return; | 156 | return; |
@@ -94,3 +182,24 @@ int __init noirqdebug_setup(char *str) | |||
94 | 182 | ||
95 | __setup("noirqdebug", noirqdebug_setup); | 183 | __setup("noirqdebug", noirqdebug_setup); |
96 | 184 | ||
185 | static int __init irqfixup_setup(char *str) | ||
186 | { | ||
187 | irqfixup = 1; | ||
188 | printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); | ||
189 | printk(KERN_WARNING "This may impact system performance.\n"); | ||
190 | return 1; | ||
191 | } | ||
192 | |||
193 | __setup("irqfixup", irqfixup_setup); | ||
194 | |||
195 | static int __init irqpoll_setup(char *str) | ||
196 | { | ||
197 | irqfixup = 2; | ||
198 | printk(KERN_WARNING "Misrouted IRQ fixup and polling support " | ||
199 | "enabled\n"); | ||
200 | printk(KERN_WARNING "This may significantly impact system " | ||
201 | "performance\n"); | ||
202 | return 1; | ||
203 | } | ||
204 | |||
205 | __setup("irqpoll", irqpoll_setup); | ||
diff --git a/kernel/itimer.c b/kernel/itimer.c index 1dc988e0d2c7..7c1b25e25e47 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c | |||
@@ -112,28 +112,11 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) | |||
112 | return error; | 112 | return error; |
113 | } | 113 | } |
114 | 114 | ||
115 | /* | ||
116 | * Called with P->sighand->siglock held and P->signal->real_timer inactive. | ||
117 | * If interval is nonzero, arm the timer for interval ticks from now. | ||
118 | */ | ||
119 | static inline void it_real_arm(struct task_struct *p, unsigned long interval) | ||
120 | { | ||
121 | p->signal->it_real_value = interval; /* XXX unnecessary field?? */ | ||
122 | if (interval == 0) | ||
123 | return; | ||
124 | if (interval > (unsigned long) LONG_MAX) | ||
125 | interval = LONG_MAX; | ||
126 | /* the "+ 1" below makes sure that the timer doesn't go off before | ||
127 | * the interval requested. This could happen if | ||
128 | * time requested % (usecs per jiffy) is more than the usecs left | ||
129 | * in the current jiffy */ | ||
130 | p->signal->real_timer.expires = jiffies + interval + 1; | ||
131 | add_timer(&p->signal->real_timer); | ||
132 | } | ||
133 | 115 | ||
134 | void it_real_fn(unsigned long __data) | 116 | void it_real_fn(unsigned long __data) |
135 | { | 117 | { |
136 | struct task_struct * p = (struct task_struct *) __data; | 118 | struct task_struct * p = (struct task_struct *) __data; |
119 | unsigned long inc = p->signal->it_real_incr; | ||
137 | 120 | ||
138 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); | 121 | send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); |
139 | 122 | ||
@@ -141,26 +124,42 @@ void it_real_fn(unsigned long __data) | |||
141 | * Now restart the timer if necessary. We don't need any locking | 124 | * Now restart the timer if necessary. We don't need any locking |
142 | * here because do_setitimer makes sure we have finished running | 125 | * here because do_setitimer makes sure we have finished running |
143 | * before it touches anything. | 126 | * before it touches anything. |
127 | * Note, we KNOW we are (or should be) at a jiffie edge here so | ||
128 | * we don't need the +1 stuff. Also, we want to use the prior | ||
129 | * expire value so as to not "slip" a jiffie if we are late. | ||
130 | * Deal with requesting a time prior to "now" here rather than | ||
131 | * in add_timer. | ||
144 | */ | 132 | */ |
145 | it_real_arm(p, p->signal->it_real_incr); | 133 | if (!inc) |
134 | return; | ||
135 | while (time_before_eq(p->signal->real_timer.expires, jiffies)) | ||
136 | p->signal->real_timer.expires += inc; | ||
137 | add_timer(&p->signal->real_timer); | ||
146 | } | 138 | } |
147 | 139 | ||
148 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | 140 | int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) |
149 | { | 141 | { |
150 | struct task_struct *tsk = current; | 142 | struct task_struct *tsk = current; |
151 | unsigned long val, interval; | 143 | unsigned long val, interval, expires; |
152 | cputime_t cval, cinterval, nval, ninterval; | 144 | cputime_t cval, cinterval, nval, ninterval; |
153 | 145 | ||
154 | switch (which) { | 146 | switch (which) { |
155 | case ITIMER_REAL: | 147 | case ITIMER_REAL: |
148 | again: | ||
156 | spin_lock_irq(&tsk->sighand->siglock); | 149 | spin_lock_irq(&tsk->sighand->siglock); |
157 | interval = tsk->signal->it_real_incr; | 150 | interval = tsk->signal->it_real_incr; |
158 | val = it_real_value(tsk->signal); | 151 | val = it_real_value(tsk->signal); |
159 | if (val) | 152 | /* We are sharing ->siglock with it_real_fn() */ |
160 | del_timer_sync(&tsk->signal->real_timer); | 153 | if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) { |
154 | spin_unlock_irq(&tsk->sighand->siglock); | ||
155 | goto again; | ||
156 | } | ||
161 | tsk->signal->it_real_incr = | 157 | tsk->signal->it_real_incr = |
162 | timeval_to_jiffies(&value->it_interval); | 158 | timeval_to_jiffies(&value->it_interval); |
163 | it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); | 159 | expires = timeval_to_jiffies(&value->it_value); |
160 | if (expires) | ||
161 | mod_timer(&tsk->signal->real_timer, | ||
162 | jiffies + 1 + expires); | ||
164 | spin_unlock_irq(&tsk->sighand->siglock); | 163 | spin_unlock_irq(&tsk->sighand->siglock); |
165 | if (ovalue) { | 164 | if (ovalue) { |
166 | jiffies_to_timeval(val, &ovalue->it_value); | 165 | jiffies_to_timeval(val, &ovalue->it_value); |
diff --git a/kernel/kexec.c b/kernel/kexec.c new file mode 100644 index 000000000000..cdd4dcd8fb63 --- /dev/null +++ b/kernel/kexec.c | |||
@@ -0,0 +1,1063 @@ | |||
1 | /* | ||
2 | * kexec.c - kexec system call | ||
3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | ||
4 | * | ||
5 | * This source code is licensed under the GNU General Public License, | ||
6 | * Version 2. See the file COPYING for more details. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/file.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/fs.h> | ||
13 | #include <linux/kexec.h> | ||
14 | #include <linux/spinlock.h> | ||
15 | #include <linux/list.h> | ||
16 | #include <linux/highmem.h> | ||
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/reboot.h> | ||
19 | #include <linux/syscalls.h> | ||
20 | #include <linux/ioport.h> | ||
21 | #include <linux/hardirq.h> | ||
22 | |||
23 | #include <asm/page.h> | ||
24 | #include <asm/uaccess.h> | ||
25 | #include <asm/io.h> | ||
26 | #include <asm/system.h> | ||
27 | #include <asm/semaphore.h> | ||
28 | |||
29 | /* Location of the reserved area for the crash kernel */ | ||
30 | struct resource crashk_res = { | ||
31 | .name = "Crash kernel", | ||
32 | .start = 0, | ||
33 | .end = 0, | ||
34 | .flags = IORESOURCE_BUSY | IORESOURCE_MEM | ||
35 | }; | ||
36 | |||
37 | int kexec_should_crash(struct task_struct *p) | ||
38 | { | ||
39 | if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) | ||
40 | return 1; | ||
41 | return 0; | ||
42 | } | ||
43 | |||
44 | /* | ||
45 | * When kexec transitions to the new kernel there is a one-to-one | ||
46 | * mapping between physical and virtual addresses. On processors | ||
47 | * where you can disable the MMU this is trivial, and easy. For | ||
48 | * others it is still a simple predictable page table to setup. | ||
49 | * | ||
50 | * In that environment kexec copies the new kernel to its final | ||
51 | * resting place. This means I can only support memory whose | ||
52 | * physical address can fit in an unsigned long. In particular | ||
53 | * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. | ||
54 | * If the assembly stub has more restrictive requirements | ||
55 | * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be | ||
56 | * defined more restrictively in <asm/kexec.h>. | ||
57 | * | ||
58 | * The code for the transition from the current kernel to the | ||
59 | * new kernel is placed in the control_code_buffer, whose size | ||
60 | * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single | ||
61 | * page of memory is necessary, but some architectures require more. | ||
62 | * Because this memory must be identity mapped in the transition from | ||
63 | * virtual to physical addresses it must live in the range | ||
64 | * 0 - TASK_SIZE, as only the user space mappings are arbitrarily | ||
65 | * modifiable. | ||
66 | * | ||
67 | * The assembly stub in the control code buffer is passed a linked list | ||
68 | * of descriptor pages detailing the source pages of the new kernel, | ||
69 | * and the destination addresses of those source pages. As this data | ||
70 | * structure is not used in the context of the current OS, it must | ||
71 | * be self-contained. | ||
72 | * | ||
73 | * The code has been made to work with highmem pages and will use a | ||
74 | * destination page in its final resting place (if it happens | ||
75 | * to allocate it). The end product of this is that most of the | ||
76 | * physical address space, and most of RAM can be used. | ||
77 | * | ||
78 | * Future directions include: | ||
79 | * - allocating a page table with the control code buffer identity | ||
80 | * mapped, to simplify machine_kexec and make kexec_on_panic more | ||
81 | * reliable. | ||
82 | */ | ||
83 | |||
84 | /* | ||
85 | * KIMAGE_NO_DEST is an impossible destination address..., for | ||
86 | * allocating pages whose destination address we do not care about. | ||
87 | */ | ||
88 | #define KIMAGE_NO_DEST (-1UL) | ||
89 | |||
90 | static int kimage_is_destination_range(struct kimage *image, | ||
91 | unsigned long start, unsigned long end); | ||
92 | static struct page *kimage_alloc_page(struct kimage *image, | ||
93 | unsigned int gfp_mask, | ||
94 | unsigned long dest); | ||
95 | |||
96 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | ||
97 | unsigned long nr_segments, | ||
98 | struct kexec_segment __user *segments) | ||
99 | { | ||
100 | size_t segment_bytes; | ||
101 | struct kimage *image; | ||
102 | unsigned long i; | ||
103 | int result; | ||
104 | |||
105 | /* Allocate a controlling structure */ | ||
106 | result = -ENOMEM; | ||
107 | image = kmalloc(sizeof(*image), GFP_KERNEL); | ||
108 | if (!image) | ||
109 | goto out; | ||
110 | |||
111 | memset(image, 0, sizeof(*image)); | ||
112 | image->head = 0; | ||
113 | image->entry = &image->head; | ||
114 | image->last_entry = &image->head; | ||
115 | image->control_page = ~0; /* By default this does not apply */ | ||
116 | image->start = entry; | ||
117 | image->type = KEXEC_TYPE_DEFAULT; | ||
118 | |||
119 | /* Initialize the list of control pages */ | ||
120 | INIT_LIST_HEAD(&image->control_pages); | ||
121 | |||
122 | /* Initialize the list of destination pages */ | ||
123 | INIT_LIST_HEAD(&image->dest_pages); | ||
124 | |||
125 | /* Initialize the list of unuseable pages */ | ||
126 | INIT_LIST_HEAD(&image->unuseable_pages); | ||
127 | |||
128 | /* Read in the segments */ | ||
129 | image->nr_segments = nr_segments; | ||
130 | segment_bytes = nr_segments * sizeof(*segments); | ||
131 | result = copy_from_user(image->segment, segments, segment_bytes); | ||
132 | if (result) | ||
133 | goto out; | ||
134 | |||
135 | /* | ||
136 | * Verify we have good destination addresses. The caller is | ||
137 | * responsible for making certain we don't attempt to load | ||
138 | * the new image into invalid or reserved areas of RAM. This | ||
139 | * just verifies it is an address we can use. | ||
140 | * | ||
141 | * Since the kernel does everything in page size chunks ensure | ||
142 | * the destination addreses are page aligned. Too many | ||
143 | * special cases crop of when we don't do this. The most | ||
144 | * insidious is getting overlapping destination addresses | ||
145 | * simply because addresses are changed to page size | ||
146 | * granularity. | ||
147 | */ | ||
148 | result = -EADDRNOTAVAIL; | ||
149 | for (i = 0; i < nr_segments; i++) { | ||
150 | unsigned long mstart, mend; | ||
151 | |||
152 | mstart = image->segment[i].mem; | ||
153 | mend = mstart + image->segment[i].memsz; | ||
154 | if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) | ||
155 | goto out; | ||
156 | if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) | ||
157 | goto out; | ||
158 | } | ||
159 | |||
160 | /* Verify our destination addresses do not overlap. | ||
161 | * If we alloed overlapping destination addresses | ||
162 | * through very weird things can happen with no | ||
163 | * easy explanation as one segment stops on another. | ||
164 | */ | ||
165 | result = -EINVAL; | ||
166 | for (i = 0; i < nr_segments; i++) { | ||
167 | unsigned long mstart, mend; | ||
168 | unsigned long j; | ||
169 | |||
170 | mstart = image->segment[i].mem; | ||
171 | mend = mstart + image->segment[i].memsz; | ||
172 | for (j = 0; j < i; j++) { | ||
173 | unsigned long pstart, pend; | ||
174 | pstart = image->segment[j].mem; | ||
175 | pend = pstart + image->segment[j].memsz; | ||
176 | /* Do the segments overlap ? */ | ||
177 | if ((mend > pstart) && (mstart < pend)) | ||
178 | goto out; | ||
179 | } | ||
180 | } | ||
181 | |||
182 | /* Ensure our buffer sizes are strictly less than | ||
183 | * our memory sizes. This should always be the case, | ||
184 | * and it is easier to check up front than to be surprised | ||
185 | * later on. | ||
186 | */ | ||
187 | result = -EINVAL; | ||
188 | for (i = 0; i < nr_segments; i++) { | ||
189 | if (image->segment[i].bufsz > image->segment[i].memsz) | ||
190 | goto out; | ||
191 | } | ||
192 | |||
193 | result = 0; | ||
194 | out: | ||
195 | if (result == 0) | ||
196 | *rimage = image; | ||
197 | else | ||
198 | kfree(image); | ||
199 | |||
200 | return result; | ||
201 | |||
202 | } | ||
203 | |||
204 | static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, | ||
205 | unsigned long nr_segments, | ||
206 | struct kexec_segment __user *segments) | ||
207 | { | ||
208 | int result; | ||
209 | struct kimage *image; | ||
210 | |||
211 | /* Allocate and initialize a controlling structure */ | ||
212 | image = NULL; | ||
213 | result = do_kimage_alloc(&image, entry, nr_segments, segments); | ||
214 | if (result) | ||
215 | goto out; | ||
216 | |||
217 | *rimage = image; | ||
218 | |||
219 | /* | ||
220 | * Find a location for the control code buffer, and add it | ||
221 | * the vector of segments so that it's pages will also be | ||
222 | * counted as destination pages. | ||
223 | */ | ||
224 | result = -ENOMEM; | ||
225 | image->control_code_page = kimage_alloc_control_pages(image, | ||
226 | get_order(KEXEC_CONTROL_CODE_SIZE)); | ||
227 | if (!image->control_code_page) { | ||
228 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | ||
229 | goto out; | ||
230 | } | ||
231 | |||
232 | result = 0; | ||
233 | out: | ||
234 | if (result == 0) | ||
235 | *rimage = image; | ||
236 | else | ||
237 | kfree(image); | ||
238 | |||
239 | return result; | ||
240 | } | ||
241 | |||
242 | static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, | ||
243 | unsigned long nr_segments, | ||
244 | struct kexec_segment __user *segments) | ||
245 | { | ||
246 | int result; | ||
247 | struct kimage *image; | ||
248 | unsigned long i; | ||
249 | |||
250 | image = NULL; | ||
251 | /* Verify we have a valid entry point */ | ||
252 | if ((entry < crashk_res.start) || (entry > crashk_res.end)) { | ||
253 | result = -EADDRNOTAVAIL; | ||
254 | goto out; | ||
255 | } | ||
256 | |||
257 | /* Allocate and initialize a controlling structure */ | ||
258 | result = do_kimage_alloc(&image, entry, nr_segments, segments); | ||
259 | if (result) | ||
260 | goto out; | ||
261 | |||
262 | /* Enable the special crash kernel control page | ||
263 | * allocation policy. | ||
264 | */ | ||
265 | image->control_page = crashk_res.start; | ||
266 | image->type = KEXEC_TYPE_CRASH; | ||
267 | |||
268 | /* | ||
269 | * Verify we have good destination addresses. Normally | ||
270 | * the caller is responsible for making certain we don't | ||
271 | * attempt to load the new image into invalid or reserved | ||
272 | * areas of RAM. But crash kernels are preloaded into a | ||
273 | * reserved area of ram. We must ensure the addresses | ||
274 | * are in the reserved area otherwise preloading the | ||
275 | * kernel could corrupt things. | ||
276 | */ | ||
277 | result = -EADDRNOTAVAIL; | ||
278 | for (i = 0; i < nr_segments; i++) { | ||
279 | unsigned long mstart, mend; | ||
280 | |||
281 | mstart = image->segment[i].mem; | ||
282 | mend = mstart + image->segment[i].memsz - 1; | ||
283 | /* Ensure we are within the crash kernel limits */ | ||
284 | if ((mstart < crashk_res.start) || (mend > crashk_res.end)) | ||
285 | goto out; | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * Find a location for the control code buffer, and add | ||
290 | * the vector of segments so that it's pages will also be | ||
291 | * counted as destination pages. | ||
292 | */ | ||
293 | result = -ENOMEM; | ||
294 | image->control_code_page = kimage_alloc_control_pages(image, | ||
295 | get_order(KEXEC_CONTROL_CODE_SIZE)); | ||
296 | if (!image->control_code_page) { | ||
297 | printk(KERN_ERR "Could not allocate control_code_buffer\n"); | ||
298 | goto out; | ||
299 | } | ||
300 | |||
301 | result = 0; | ||
302 | out: | ||
303 | if (result == 0) | ||
304 | *rimage = image; | ||
305 | else | ||
306 | kfree(image); | ||
307 | |||
308 | return result; | ||
309 | } | ||
310 | |||
311 | static int kimage_is_destination_range(struct kimage *image, | ||
312 | unsigned long start, | ||
313 | unsigned long end) | ||
314 | { | ||
315 | unsigned long i; | ||
316 | |||
317 | for (i = 0; i < image->nr_segments; i++) { | ||
318 | unsigned long mstart, mend; | ||
319 | |||
320 | mstart = image->segment[i].mem; | ||
321 | mend = mstart + image->segment[i].memsz; | ||
322 | if ((end > mstart) && (start < mend)) | ||
323 | return 1; | ||
324 | } | ||
325 | |||
326 | return 0; | ||
327 | } | ||
328 | |||
329 | static struct page *kimage_alloc_pages(unsigned int gfp_mask, | ||
330 | unsigned int order) | ||
331 | { | ||
332 | struct page *pages; | ||
333 | |||
334 | pages = alloc_pages(gfp_mask, order); | ||
335 | if (pages) { | ||
336 | unsigned int count, i; | ||
337 | pages->mapping = NULL; | ||
338 | pages->private = order; | ||
339 | count = 1 << order; | ||
340 | for (i = 0; i < count; i++) | ||
341 | SetPageReserved(pages + i); | ||
342 | } | ||
343 | |||
344 | return pages; | ||
345 | } | ||
346 | |||
347 | static void kimage_free_pages(struct page *page) | ||
348 | { | ||
349 | unsigned int order, count, i; | ||
350 | |||
351 | order = page->private; | ||
352 | count = 1 << order; | ||
353 | for (i = 0; i < count; i++) | ||
354 | ClearPageReserved(page + i); | ||
355 | __free_pages(page, order); | ||
356 | } | ||
357 | |||
358 | static void kimage_free_page_list(struct list_head *list) | ||
359 | { | ||
360 | struct list_head *pos, *next; | ||
361 | |||
362 | list_for_each_safe(pos, next, list) { | ||
363 | struct page *page; | ||
364 | |||
365 | page = list_entry(pos, struct page, lru); | ||
366 | list_del(&page->lru); | ||
367 | kimage_free_pages(page); | ||
368 | } | ||
369 | } | ||
370 | |||
371 | static struct page *kimage_alloc_normal_control_pages(struct kimage *image, | ||
372 | unsigned int order) | ||
373 | { | ||
374 | /* Control pages are special, they are the intermediaries | ||
375 | * that are needed while we copy the rest of the pages | ||
376 | * to their final resting place. As such they must | ||
377 | * not conflict with either the destination addresses | ||
378 | * or memory the kernel is already using. | ||
379 | * | ||
380 | * The only case where we really need more than one of | ||
381 | * these are for architectures where we cannot disable | ||
382 | * the MMU and must instead generate an identity mapped | ||
383 | * page table for all of the memory. | ||
384 | * | ||
385 | * At worst this runs in O(N) of the image size. | ||
386 | */ | ||
387 | struct list_head extra_pages; | ||
388 | struct page *pages; | ||
389 | unsigned int count; | ||
390 | |||
391 | count = 1 << order; | ||
392 | INIT_LIST_HEAD(&extra_pages); | ||
393 | |||
394 | /* Loop while I can allocate a page and the page allocated | ||
395 | * is a destination page. | ||
396 | */ | ||
397 | do { | ||
398 | unsigned long pfn, epfn, addr, eaddr; | ||
399 | |||
400 | pages = kimage_alloc_pages(GFP_KERNEL, order); | ||
401 | if (!pages) | ||
402 | break; | ||
403 | pfn = page_to_pfn(pages); | ||
404 | epfn = pfn + count; | ||
405 | addr = pfn << PAGE_SHIFT; | ||
406 | eaddr = epfn << PAGE_SHIFT; | ||
407 | if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || | ||
408 | kimage_is_destination_range(image, addr, eaddr)) { | ||
409 | list_add(&pages->lru, &extra_pages); | ||
410 | pages = NULL; | ||
411 | } | ||
412 | } while (!pages); | ||
413 | |||
414 | if (pages) { | ||
415 | /* Remember the allocated page... */ | ||
416 | list_add(&pages->lru, &image->control_pages); | ||
417 | |||
418 | /* Because the page is already in it's destination | ||
419 | * location we will never allocate another page at | ||
420 | * that address. Therefore kimage_alloc_pages | ||
421 | * will not return it (again) and we don't need | ||
422 | * to give it an entry in image->segment[]. | ||
423 | */ | ||
424 | } | ||
425 | /* Deal with the destination pages I have inadvertently allocated. | ||
426 | * | ||
427 | * Ideally I would convert multi-page allocations into single | ||
428 | * page allocations, and add everyting to image->dest_pages. | ||
429 | * | ||
430 | * For now it is simpler to just free the pages. | ||
431 | */ | ||
432 | kimage_free_page_list(&extra_pages); | ||
433 | |||
434 | return pages; | ||
435 | } | ||
436 | |||
437 | static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | ||
438 | unsigned int order) | ||
439 | { | ||
440 | /* Control pages are special, they are the intermediaries | ||
441 | * that are needed while we copy the rest of the pages | ||
442 | * to their final resting place. As such they must | ||
443 | * not conflict with either the destination addresses | ||
444 | * or memory the kernel is already using. | ||
445 | * | ||
446 | * Control pages are also the only pags we must allocate | ||
447 | * when loading a crash kernel. All of the other pages | ||
448 | * are specified by the segments and we just memcpy | ||
449 | * into them directly. | ||
450 | * | ||
451 | * The only case where we really need more than one of | ||
452 | * these are for architectures where we cannot disable | ||
453 | * the MMU and must instead generate an identity mapped | ||
454 | * page table for all of the memory. | ||
455 | * | ||
456 | * Given the low demand this implements a very simple | ||
457 | * allocator that finds the first hole of the appropriate | ||
458 | * size in the reserved memory region, and allocates all | ||
459 | * of the memory up to and including the hole. | ||
460 | */ | ||
461 | unsigned long hole_start, hole_end, size; | ||
462 | struct page *pages; | ||
463 | |||
464 | pages = NULL; | ||
465 | size = (1 << order) << PAGE_SHIFT; | ||
466 | hole_start = (image->control_page + (size - 1)) & ~(size - 1); | ||
467 | hole_end = hole_start + size - 1; | ||
468 | while (hole_end <= crashk_res.end) { | ||
469 | unsigned long i; | ||
470 | |||
471 | if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) | ||
472 | break; | ||
473 | if (hole_end > crashk_res.end) | ||
474 | break; | ||
475 | /* See if I overlap any of the segments */ | ||
476 | for (i = 0; i < image->nr_segments; i++) { | ||
477 | unsigned long mstart, mend; | ||
478 | |||
479 | mstart = image->segment[i].mem; | ||
480 | mend = mstart + image->segment[i].memsz - 1; | ||
481 | if ((hole_end >= mstart) && (hole_start <= mend)) { | ||
482 | /* Advance the hole to the end of the segment */ | ||
483 | hole_start = (mend + (size - 1)) & ~(size - 1); | ||
484 | hole_end = hole_start + size - 1; | ||
485 | break; | ||
486 | } | ||
487 | } | ||
488 | /* If I don't overlap any segments I have found my hole! */ | ||
489 | if (i == image->nr_segments) { | ||
490 | pages = pfn_to_page(hole_start >> PAGE_SHIFT); | ||
491 | break; | ||
492 | } | ||
493 | } | ||
494 | if (pages) | ||
495 | image->control_page = hole_end; | ||
496 | |||
497 | return pages; | ||
498 | } | ||
499 | |||
500 | |||
501 | struct page *kimage_alloc_control_pages(struct kimage *image, | ||
502 | unsigned int order) | ||
503 | { | ||
504 | struct page *pages = NULL; | ||
505 | |||
506 | switch (image->type) { | ||
507 | case KEXEC_TYPE_DEFAULT: | ||
508 | pages = kimage_alloc_normal_control_pages(image, order); | ||
509 | break; | ||
510 | case KEXEC_TYPE_CRASH: | ||
511 | pages = kimage_alloc_crash_control_pages(image, order); | ||
512 | break; | ||
513 | } | ||
514 | |||
515 | return pages; | ||
516 | } | ||
517 | |||
518 | static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) | ||
519 | { | ||
520 | if (*image->entry != 0) | ||
521 | image->entry++; | ||
522 | |||
523 | if (image->entry == image->last_entry) { | ||
524 | kimage_entry_t *ind_page; | ||
525 | struct page *page; | ||
526 | |||
527 | page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); | ||
528 | if (!page) | ||
529 | return -ENOMEM; | ||
530 | |||
531 | ind_page = page_address(page); | ||
532 | *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; | ||
533 | image->entry = ind_page; | ||
534 | image->last_entry = ind_page + | ||
535 | ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); | ||
536 | } | ||
537 | *image->entry = entry; | ||
538 | image->entry++; | ||
539 | *image->entry = 0; | ||
540 | |||
541 | return 0; | ||
542 | } | ||
543 | |||
544 | static int kimage_set_destination(struct kimage *image, | ||
545 | unsigned long destination) | ||
546 | { | ||
547 | int result; | ||
548 | |||
549 | destination &= PAGE_MASK; | ||
550 | result = kimage_add_entry(image, destination | IND_DESTINATION); | ||
551 | if (result == 0) | ||
552 | image->destination = destination; | ||
553 | |||
554 | return result; | ||
555 | } | ||
556 | |||
557 | |||
558 | static int kimage_add_page(struct kimage *image, unsigned long page) | ||
559 | { | ||
560 | int result; | ||
561 | |||
562 | page &= PAGE_MASK; | ||
563 | result = kimage_add_entry(image, page | IND_SOURCE); | ||
564 | if (result == 0) | ||
565 | image->destination += PAGE_SIZE; | ||
566 | |||
567 | return result; | ||
568 | } | ||
569 | |||
570 | |||
571 | static void kimage_free_extra_pages(struct kimage *image) | ||
572 | { | ||
573 | /* Walk through and free any extra destination pages I may have */ | ||
574 | kimage_free_page_list(&image->dest_pages); | ||
575 | |||
576 | /* Walk through and free any unuseable pages I have cached */ | ||
577 | kimage_free_page_list(&image->unuseable_pages); | ||
578 | |||
579 | } | ||
580 | static int kimage_terminate(struct kimage *image) | ||
581 | { | ||
582 | if (*image->entry != 0) | ||
583 | image->entry++; | ||
584 | |||
585 | *image->entry = IND_DONE; | ||
586 | |||
587 | return 0; | ||
588 | } | ||
589 | |||
/*
 * Walk the entry list of @image.
 *
 * @ptr is set to each slot in turn and @entry to the value stored in
 * it.  The walk starts at image->head, follows IND_INDIRECTION entries
 * into the page they point at, and stops at a zero entry or at an
 * entry with IND_DONE set.  Every argument is parenthesized so that
 * arbitrary expressions may be passed safely.
 */
#define for_each_kimage_entry(image, ptr, entry) \
	for ((ptr) = &(image)->head; \
	     ((entry) = *(ptr)) && !((entry) & IND_DONE); \
	     (ptr) = ((entry) & IND_INDIRECTION) ? \
			phys_to_virt(((entry) & PAGE_MASK)) : (ptr) + 1)
594 | |||
595 | static void kimage_free_entry(kimage_entry_t entry) | ||
596 | { | ||
597 | struct page *page; | ||
598 | |||
599 | page = pfn_to_page(entry >> PAGE_SHIFT); | ||
600 | kimage_free_pages(page); | ||
601 | } | ||
602 | |||
603 | static void kimage_free(struct kimage *image) | ||
604 | { | ||
605 | kimage_entry_t *ptr, entry; | ||
606 | kimage_entry_t ind = 0; | ||
607 | |||
608 | if (!image) | ||
609 | return; | ||
610 | |||
611 | kimage_free_extra_pages(image); | ||
612 | for_each_kimage_entry(image, ptr, entry) { | ||
613 | if (entry & IND_INDIRECTION) { | ||
614 | /* Free the previous indirection page */ | ||
615 | if (ind & IND_INDIRECTION) | ||
616 | kimage_free_entry(ind); | ||
617 | /* Save this indirection page until we are | ||
618 | * done with it. | ||
619 | */ | ||
620 | ind = entry; | ||
621 | } | ||
622 | else if (entry & IND_SOURCE) | ||
623 | kimage_free_entry(entry); | ||
624 | } | ||
625 | /* Free the final indirection page */ | ||
626 | if (ind & IND_INDIRECTION) | ||
627 | kimage_free_entry(ind); | ||
628 | |||
629 | /* Handle any machine specific cleanup */ | ||
630 | machine_kexec_cleanup(image); | ||
631 | |||
632 | /* Free the kexec control pages... */ | ||
633 | kimage_free_page_list(&image->control_pages); | ||
634 | kfree(image); | ||
635 | } | ||
636 | |||
637 | static kimage_entry_t *kimage_dst_used(struct kimage *image, | ||
638 | unsigned long page) | ||
639 | { | ||
640 | kimage_entry_t *ptr, entry; | ||
641 | unsigned long destination = 0; | ||
642 | |||
643 | for_each_kimage_entry(image, ptr, entry) { | ||
644 | if (entry & IND_DESTINATION) | ||
645 | destination = entry & PAGE_MASK; | ||
646 | else if (entry & IND_SOURCE) { | ||
647 | if (page == destination) | ||
648 | return ptr; | ||
649 | destination += PAGE_SIZE; | ||
650 | } | ||
651 | } | ||
652 | |||
653 | return NULL; | ||
654 | } | ||
655 | |||
656 | static struct page *kimage_alloc_page(struct kimage *image, | ||
657 | unsigned int gfp_mask, | ||
658 | unsigned long destination) | ||
659 | { | ||
660 | /* | ||
661 | * Here we implement safeguards to ensure that a source page | ||
662 | * is not copied to its destination page before the data on | ||
663 | * the destination page is no longer useful. | ||
664 | * | ||
665 | * To do this we maintain the invariant that a source page is | ||
666 | * either its own destination page, or it is not a | ||
667 | * destination page at all. | ||
668 | * | ||
669 | * That is slightly stronger than required, but the proof | ||
670 | * that no problems will not occur is trivial, and the | ||
671 | * implementation is simply to verify. | ||
672 | * | ||
673 | * When allocating all pages normally this algorithm will run | ||
674 | * in O(N) time, but in the worst case it will run in O(N^2) | ||
675 | * time. If the runtime is a problem the data structures can | ||
676 | * be fixed. | ||
677 | */ | ||
678 | struct page *page; | ||
679 | unsigned long addr; | ||
680 | |||
681 | /* | ||
682 | * Walk through the list of destination pages, and see if I | ||
683 | * have a match. | ||
684 | */ | ||
685 | list_for_each_entry(page, &image->dest_pages, lru) { | ||
686 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
687 | if (addr == destination) { | ||
688 | list_del(&page->lru); | ||
689 | return page; | ||
690 | } | ||
691 | } | ||
692 | page = NULL; | ||
693 | while (1) { | ||
694 | kimage_entry_t *old; | ||
695 | |||
696 | /* Allocate a page, if we run out of memory give up */ | ||
697 | page = kimage_alloc_pages(gfp_mask, 0); | ||
698 | if (!page) | ||
699 | return NULL; | ||
700 | /* If the page cannot be used file it away */ | ||
701 | if (page_to_pfn(page) > | ||
702 | (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { | ||
703 | list_add(&page->lru, &image->unuseable_pages); | ||
704 | continue; | ||
705 | } | ||
706 | addr = page_to_pfn(page) << PAGE_SHIFT; | ||
707 | |||
708 | /* If it is the destination page we want use it */ | ||
709 | if (addr == destination) | ||
710 | break; | ||
711 | |||
712 | /* If the page is not a destination page use it */ | ||
713 | if (!kimage_is_destination_range(image, addr, | ||
714 | addr + PAGE_SIZE)) | ||
715 | break; | ||
716 | |||
717 | /* | ||
718 | * I know that the page is someones destination page. | ||
719 | * See if there is already a source page for this | ||
720 | * destination page. And if so swap the source pages. | ||
721 | */ | ||
722 | old = kimage_dst_used(image, addr); | ||
723 | if (old) { | ||
724 | /* If so move it */ | ||
725 | unsigned long old_addr; | ||
726 | struct page *old_page; | ||
727 | |||
728 | old_addr = *old & PAGE_MASK; | ||
729 | old_page = pfn_to_page(old_addr >> PAGE_SHIFT); | ||
730 | copy_highpage(page, old_page); | ||
731 | *old = addr | (*old & ~PAGE_MASK); | ||
732 | |||
733 | /* The old page I have found cannot be a | ||
734 | * destination page, so return it. | ||
735 | */ | ||
736 | addr = old_addr; | ||
737 | page = old_page; | ||
738 | break; | ||
739 | } | ||
740 | else { | ||
741 | /* Place the page on the destination list I | ||
742 | * will use it later. | ||
743 | */ | ||
744 | list_add(&page->lru, &image->dest_pages); | ||
745 | } | ||
746 | } | ||
747 | |||
748 | return page; | ||
749 | } | ||
750 | |||
751 | static int kimage_load_normal_segment(struct kimage *image, | ||
752 | struct kexec_segment *segment) | ||
753 | { | ||
754 | unsigned long maddr; | ||
755 | unsigned long ubytes, mbytes; | ||
756 | int result; | ||
757 | unsigned char __user *buf; | ||
758 | |||
759 | result = 0; | ||
760 | buf = segment->buf; | ||
761 | ubytes = segment->bufsz; | ||
762 | mbytes = segment->memsz; | ||
763 | maddr = segment->mem; | ||
764 | |||
765 | result = kimage_set_destination(image, maddr); | ||
766 | if (result < 0) | ||
767 | goto out; | ||
768 | |||
769 | while (mbytes) { | ||
770 | struct page *page; | ||
771 | char *ptr; | ||
772 | size_t uchunk, mchunk; | ||
773 | |||
774 | page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); | ||
775 | if (page == 0) { | ||
776 | result = -ENOMEM; | ||
777 | goto out; | ||
778 | } | ||
779 | result = kimage_add_page(image, page_to_pfn(page) | ||
780 | << PAGE_SHIFT); | ||
781 | if (result < 0) | ||
782 | goto out; | ||
783 | |||
784 | ptr = kmap(page); | ||
785 | /* Start with a clear page */ | ||
786 | memset(ptr, 0, PAGE_SIZE); | ||
787 | ptr += maddr & ~PAGE_MASK; | ||
788 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | ||
789 | if (mchunk > mbytes) | ||
790 | mchunk = mbytes; | ||
791 | |||
792 | uchunk = mchunk; | ||
793 | if (uchunk > ubytes) | ||
794 | uchunk = ubytes; | ||
795 | |||
796 | result = copy_from_user(ptr, buf, uchunk); | ||
797 | kunmap(page); | ||
798 | if (result) { | ||
799 | result = (result < 0) ? result : -EIO; | ||
800 | goto out; | ||
801 | } | ||
802 | ubytes -= uchunk; | ||
803 | maddr += mchunk; | ||
804 | buf += mchunk; | ||
805 | mbytes -= mchunk; | ||
806 | } | ||
807 | out: | ||
808 | return result; | ||
809 | } | ||
810 | |||
811 | static int kimage_load_crash_segment(struct kimage *image, | ||
812 | struct kexec_segment *segment) | ||
813 | { | ||
814 | /* For crash dumps kernels we simply copy the data from | ||
815 | * user space to it's destination. | ||
816 | * We do things a page at a time for the sake of kmap. | ||
817 | */ | ||
818 | unsigned long maddr; | ||
819 | unsigned long ubytes, mbytes; | ||
820 | int result; | ||
821 | unsigned char __user *buf; | ||
822 | |||
823 | result = 0; | ||
824 | buf = segment->buf; | ||
825 | ubytes = segment->bufsz; | ||
826 | mbytes = segment->memsz; | ||
827 | maddr = segment->mem; | ||
828 | while (mbytes) { | ||
829 | struct page *page; | ||
830 | char *ptr; | ||
831 | size_t uchunk, mchunk; | ||
832 | |||
833 | page = pfn_to_page(maddr >> PAGE_SHIFT); | ||
834 | if (page == 0) { | ||
835 | result = -ENOMEM; | ||
836 | goto out; | ||
837 | } | ||
838 | ptr = kmap(page); | ||
839 | ptr += maddr & ~PAGE_MASK; | ||
840 | mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); | ||
841 | if (mchunk > mbytes) | ||
842 | mchunk = mbytes; | ||
843 | |||
844 | uchunk = mchunk; | ||
845 | if (uchunk > ubytes) { | ||
846 | uchunk = ubytes; | ||
847 | /* Zero the trailing part of the page */ | ||
848 | memset(ptr + uchunk, 0, mchunk - uchunk); | ||
849 | } | ||
850 | result = copy_from_user(ptr, buf, uchunk); | ||
851 | kunmap(page); | ||
852 | if (result) { | ||
853 | result = (result < 0) ? result : -EIO; | ||
854 | goto out; | ||
855 | } | ||
856 | ubytes -= uchunk; | ||
857 | maddr += mchunk; | ||
858 | buf += mchunk; | ||
859 | mbytes -= mchunk; | ||
860 | } | ||
861 | out: | ||
862 | return result; | ||
863 | } | ||
864 | |||
865 | static int kimage_load_segment(struct kimage *image, | ||
866 | struct kexec_segment *segment) | ||
867 | { | ||
868 | int result = -ENOMEM; | ||
869 | |||
870 | switch (image->type) { | ||
871 | case KEXEC_TYPE_DEFAULT: | ||
872 | result = kimage_load_normal_segment(image, segment); | ||
873 | break; | ||
874 | case KEXEC_TYPE_CRASH: | ||
875 | result = kimage_load_crash_segment(image, segment); | ||
876 | break; | ||
877 | } | ||
878 | |||
879 | return result; | ||
880 | } | ||
881 | |||
882 | /* | ||
883 | * Exec Kernel system call: for obvious reasons only root may call it. | ||
884 | * | ||
885 | * This call breaks up into three pieces. | ||
886 | * - A generic part which loads the new kernel from the current | ||
887 | * address space, and very carefully places the data in the | ||
888 | * allocated pages. | ||
889 | * | ||
890 | * - A generic part that interacts with the kernel and tells all of | ||
891 | * the devices to shut down. Preventing on-going dmas, and placing | ||
892 | * the devices in a consistent state so a later kernel can | ||
893 | * reinitialize them. | ||
894 | * | ||
895 | * - A machine specific part that includes the syscall number | ||
896 | * and then copies the image to its final destination. And | ||
897 | * jumps into the image at entry. | ||
898 | * | ||
899 | * kexec does not sync, or unmount filesystems so if you need | ||
900 | * that to happen you need to do that yourself. | ||
901 | */ | ||
/* Image installed by a normal load; starts out empty (NULL) */
struct kimage *kexec_image;
/* Image installed by a KEXEC_ON_CRASH load; starts out empty (NULL) */
static struct kimage *kexec_crash_image;
/*
 * A home grown binary mutex, taken with xchg().
 * Nothing can wait so this mutex is safe to use
 * in interrupt context :)
 */
static int kexec_lock;
910 | |||
911 | asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, | ||
912 | struct kexec_segment __user *segments, | ||
913 | unsigned long flags) | ||
914 | { | ||
915 | struct kimage **dest_image, *image; | ||
916 | int locked; | ||
917 | int result; | ||
918 | |||
919 | /* We only trust the superuser with rebooting the system. */ | ||
920 | if (!capable(CAP_SYS_BOOT)) | ||
921 | return -EPERM; | ||
922 | |||
923 | /* | ||
924 | * Verify we have a legal set of flags | ||
925 | * This leaves us room for future extensions. | ||
926 | */ | ||
927 | if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) | ||
928 | return -EINVAL; | ||
929 | |||
930 | /* Verify we are on the appropriate architecture */ | ||
931 | if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && | ||
932 | ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) | ||
933 | return -EINVAL; | ||
934 | |||
935 | /* Put an artificial cap on the number | ||
936 | * of segments passed to kexec_load. | ||
937 | */ | ||
938 | if (nr_segments > KEXEC_SEGMENT_MAX) | ||
939 | return -EINVAL; | ||
940 | |||
941 | image = NULL; | ||
942 | result = 0; | ||
943 | |||
944 | /* Because we write directly to the reserved memory | ||
945 | * region when loading crash kernels we need a mutex here to | ||
946 | * prevent multiple crash kernels from attempting to load | ||
947 | * simultaneously, and to prevent a crash kernel from loading | ||
948 | * over the top of a in use crash kernel. | ||
949 | * | ||
950 | * KISS: always take the mutex. | ||
951 | */ | ||
952 | locked = xchg(&kexec_lock, 1); | ||
953 | if (locked) | ||
954 | return -EBUSY; | ||
955 | |||
956 | dest_image = &kexec_image; | ||
957 | if (flags & KEXEC_ON_CRASH) | ||
958 | dest_image = &kexec_crash_image; | ||
959 | if (nr_segments > 0) { | ||
960 | unsigned long i; | ||
961 | |||
962 | /* Loading another kernel to reboot into */ | ||
963 | if ((flags & KEXEC_ON_CRASH) == 0) | ||
964 | result = kimage_normal_alloc(&image, entry, | ||
965 | nr_segments, segments); | ||
966 | /* Loading another kernel to switch to if this one crashes */ | ||
967 | else if (flags & KEXEC_ON_CRASH) { | ||
968 | /* Free any current crash dump kernel before | ||
969 | * we corrupt it. | ||
970 | */ | ||
971 | kimage_free(xchg(&kexec_crash_image, NULL)); | ||
972 | result = kimage_crash_alloc(&image, entry, | ||
973 | nr_segments, segments); | ||
974 | } | ||
975 | if (result) | ||
976 | goto out; | ||
977 | |||
978 | result = machine_kexec_prepare(image); | ||
979 | if (result) | ||
980 | goto out; | ||
981 | |||
982 | for (i = 0; i < nr_segments; i++) { | ||
983 | result = kimage_load_segment(image, &image->segment[i]); | ||
984 | if (result) | ||
985 | goto out; | ||
986 | } | ||
987 | result = kimage_terminate(image); | ||
988 | if (result) | ||
989 | goto out; | ||
990 | } | ||
991 | /* Install the new kernel, and Uninstall the old */ | ||
992 | image = xchg(dest_image, image); | ||
993 | |||
994 | out: | ||
995 | xchg(&kexec_lock, 0); /* Release the mutex */ | ||
996 | kimage_free(image); | ||
997 | |||
998 | return result; | ||
999 | } | ||
1000 | |||
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for kexec_load.
 *
 * Each compat segment descriptor is converted into a native
 * struct kexec_segment, written to a scratch array obtained from
 * compat_alloc_user_space(), and the native system call is then
 * invoked on that array.
 *
 * Returns -EINVAL when the caller passes KEXEC_ARCH_DEFAULT or too
 * many segments, -EFAULT when a descriptor cannot be copied, and
 * otherwise whatever sys_kexec_load() returns.
 */
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				unsigned long nr_segments,
				struct compat_kexec_segment __user *segments,
				unsigned long flags)
{
	struct compat_kexec_segment cseg;
	struct kexec_segment kseg, __user *ksegments;
	unsigned long idx;

	/*
	 * Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(kseg));
	for (idx = 0; idx < nr_segments; idx++) {
		if (copy_from_user(&cseg, &segments[idx], sizeof(cseg)))
			return -EFAULT;

		/* Convert the compat descriptor field by field */
		kseg.buf = compat_ptr(cseg.buf);
		kseg.bufsz = cseg.bufsz;
		kseg.mem = cseg.mem;
		kseg.memsz = cseg.memsz;

		if (copy_to_user(&ksegments[idx], &kseg, sizeof(kseg)))
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
1039 | |||
1040 | void crash_kexec(struct pt_regs *regs) | ||
1041 | { | ||
1042 | struct kimage *image; | ||
1043 | int locked; | ||
1044 | |||
1045 | |||
1046 | /* Take the kexec_lock here to prevent sys_kexec_load | ||
1047 | * running on one cpu from replacing the crash kernel | ||
1048 | * we are using after a panic on a different cpu. | ||
1049 | * | ||
1050 | * If the crash kernel was not located in a fixed area | ||
1051 | * of memory the xchg(&kexec_crash_image) would be | ||
1052 | * sufficient. But since I reuse the memory... | ||
1053 | */ | ||
1054 | locked = xchg(&kexec_lock, 1); | ||
1055 | if (!locked) { | ||
1056 | image = xchg(&kexec_crash_image, NULL); | ||
1057 | if (image) { | ||
1058 | machine_crash_shutdown(regs); | ||
1059 | machine_kexec(image); | ||
1060 | } | ||
1061 | xchg(&kexec_lock, 0); | ||
1062 | } | ||
1063 | } | ||
diff --git a/kernel/kmod.c b/kernel/kmod.c index eed53d4f5230..44166e3bb8af 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -120,6 +120,7 @@ struct subprocess_info { | |||
120 | char *path; | 120 | char *path; |
121 | char **argv; | 121 | char **argv; |
122 | char **envp; | 122 | char **envp; |
123 | struct key *ring; | ||
123 | int wait; | 124 | int wait; |
124 | int retval; | 125 | int retval; |
125 | }; | 126 | }; |
@@ -130,16 +131,21 @@ struct subprocess_info { | |||
130 | static int ____call_usermodehelper(void *data) | 131 | static int ____call_usermodehelper(void *data) |
131 | { | 132 | { |
132 | struct subprocess_info *sub_info = data; | 133 | struct subprocess_info *sub_info = data; |
134 | struct key *old_session; | ||
133 | int retval; | 135 | int retval; |
134 | 136 | ||
135 | /* Unblock all signals. */ | 137 | /* Unblock all signals and set the session keyring. */ |
138 | key_get(sub_info->ring); | ||
136 | flush_signals(current); | 139 | flush_signals(current); |
137 | spin_lock_irq(¤t->sighand->siglock); | 140 | spin_lock_irq(¤t->sighand->siglock); |
141 | old_session = __install_session_keyring(current, sub_info->ring); | ||
138 | flush_signal_handlers(current, 1); | 142 | flush_signal_handlers(current, 1); |
139 | sigemptyset(¤t->blocked); | 143 | sigemptyset(¤t->blocked); |
140 | recalc_sigpending(); | 144 | recalc_sigpending(); |
141 | spin_unlock_irq(¤t->sighand->siglock); | 145 | spin_unlock_irq(¤t->sighand->siglock); |
142 | 146 | ||
147 | key_put(old_session); | ||
148 | |||
143 | /* We can run anywhere, unlike our parent keventd(). */ | 149 | /* We can run anywhere, unlike our parent keventd(). */ |
144 | set_cpus_allowed(current, CPU_MASK_ALL); | 150 | set_cpus_allowed(current, CPU_MASK_ALL); |
145 | 151 | ||
@@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data) | |||
211 | } | 217 | } |
212 | 218 | ||
213 | /** | 219 | /** |
214 | * call_usermodehelper - start a usermode application | 220 | * call_usermodehelper_keys - start a usermode application |
215 | * @path: pathname for the application | 221 | * @path: pathname for the application |
216 | * @argv: null-terminated argument list | 222 | * @argv: null-terminated argument list |
217 | * @envp: null-terminated environment list | 223 | * @envp: null-terminated environment list |
224 | * @session_keyring: session keyring for process (NULL for an empty keyring) | ||
218 | * @wait: wait for the application to finish and return status. | 225 | * @wait: wait for the application to finish and return status. |
219 | * | 226 | * |
220 | * Runs a user-space application. The application is started | 227 | * Runs a user-space application. The application is started |
@@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data) | |||
224 | * Must be called from process context. Returns a negative error code | 231 | * Must be called from process context. Returns a negative error code |
225 | * if program was not execed successfully, or 0. | 232 | * if program was not execed successfully, or 0. |
226 | */ | 233 | */ |
227 | int call_usermodehelper(char *path, char **argv, char **envp, int wait) | 234 | int call_usermodehelper_keys(char *path, char **argv, char **envp, |
235 | struct key *session_keyring, int wait) | ||
228 | { | 236 | { |
229 | DECLARE_COMPLETION(done); | 237 | DECLARE_COMPLETION(done); |
230 | struct subprocess_info sub_info = { | 238 | struct subprocess_info sub_info = { |
@@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) | |||
232 | .path = path, | 240 | .path = path, |
233 | .argv = argv, | 241 | .argv = argv, |
234 | .envp = envp, | 242 | .envp = envp, |
243 | .ring = session_keyring, | ||
235 | .wait = wait, | 244 | .wait = wait, |
236 | .retval = 0, | 245 | .retval = 0, |
237 | }; | 246 | }; |
@@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) | |||
247 | wait_for_completion(&done); | 256 | wait_for_completion(&done); |
248 | return sub_info.retval; | 257 | return sub_info.retval; |
249 | } | 258 | } |
250 | EXPORT_SYMBOL(call_usermodehelper); | 259 | EXPORT_SYMBOL(call_usermodehelper_keys); |
251 | 260 | ||
252 | void __init usermodehelper_init(void) | 261 | void __init usermodehelper_init(void) |
253 | { | 262 | { |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 037142b72a49..b0237122b24e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -27,12 +27,16 @@ | |||
27 | * interface to access function arguments. | 27 | * interface to access function arguments. |
28 | * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes | 28 | * 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes |
29 | * exceptions notifier to be first on the priority list. | 29 | * exceptions notifier to be first on the priority list. |
30 | * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston | ||
31 | * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi | ||
32 | * <prasanna@in.ibm.com> added function-return probes. | ||
30 | */ | 33 | */ |
31 | #include <linux/kprobes.h> | 34 | #include <linux/kprobes.h> |
32 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
33 | #include <linux/hash.h> | 36 | #include <linux/hash.h> |
34 | #include <linux/init.h> | 37 | #include <linux/init.h> |
35 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/moduleloader.h> | ||
36 | #include <asm/cacheflush.h> | 40 | #include <asm/cacheflush.h> |
37 | #include <asm/errno.h> | 41 | #include <asm/errno.h> |
38 | #include <asm/kdebug.h> | 42 | #include <asm/kdebug.h> |
@@ -41,11 +45,112 @@ | |||
41 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) | 45 | #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) |
42 | 46 | ||
43 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 47 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
48 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | ||
44 | 49 | ||
45 | unsigned int kprobe_cpu = NR_CPUS; | 50 | unsigned int kprobe_cpu = NR_CPUS; |
46 | static DEFINE_SPINLOCK(kprobe_lock); | 51 | static DEFINE_SPINLOCK(kprobe_lock); |
47 | static struct kprobe *curr_kprobe; | 52 | static struct kprobe *curr_kprobe; |
48 | 53 | ||
54 | /* | ||
55 | * kprobe->ainsn.insn points to the copy of the instruction to be | ||
56 | * single-stepped. x86_64, POWER4 and above have no-exec support, and | ||
57 | * stepping on the instruction on a vmalloced/kmalloced/data page | ||
58 | * is a recipe for disaster. | ||
59 | */ | ||
60 | #define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) | ||
61 | |||
62 | struct kprobe_insn_page { | ||
63 | struct hlist_node hlist; | ||
64 | kprobe_opcode_t *insns; /* Page of instruction slots */ | ||
65 | char slot_used[INSNS_PER_PAGE]; | ||
66 | int nused; | ||
67 | }; | ||
68 | |||
69 | static struct hlist_head kprobe_insn_pages; | ||
70 | |||
71 | /** | ||
72 | * get_insn_slot() - Find a slot on an executable page for an instruction. | ||
73 | * We allocate an executable page if there's no room on existing ones. | ||
74 | */ | ||
75 | kprobe_opcode_t *get_insn_slot(void) | ||
76 | { | ||
77 | struct kprobe_insn_page *kip; | ||
78 | struct hlist_node *pos; | ||
79 | |||
80 | hlist_for_each(pos, &kprobe_insn_pages) { | ||
81 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
82 | if (kip->nused < INSNS_PER_PAGE) { | ||
83 | int i; | ||
84 | for (i = 0; i < INSNS_PER_PAGE; i++) { | ||
85 | if (!kip->slot_used[i]) { | ||
86 | kip->slot_used[i] = 1; | ||
87 | kip->nused++; | ||
88 | return kip->insns + (i * MAX_INSN_SIZE); | ||
89 | } | ||
90 | } | ||
91 | /* Surprise! No unused slots. Fix kip->nused. */ | ||
92 | kip->nused = INSNS_PER_PAGE; | ||
93 | } | ||
94 | } | ||
95 | |||
96 | /* All out of space. Need to allocate a new page. Use slot 0. */ | ||
97 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | ||
98 | if (!kip) { | ||
99 | return NULL; | ||
100 | } | ||
101 | |||
102 | /* | ||
103 | * Use module_alloc so this page is within +/- 2GB of where the | ||
104 | * kernel image and loaded module images reside. This is required | ||
105 | * so x86_64 can correctly handle the %rip-relative fixups. | ||
106 | */ | ||
107 | kip->insns = module_alloc(PAGE_SIZE); | ||
108 | if (!kip->insns) { | ||
109 | kfree(kip); | ||
110 | return NULL; | ||
111 | } | ||
112 | INIT_HLIST_NODE(&kip->hlist); | ||
113 | hlist_add_head(&kip->hlist, &kprobe_insn_pages); | ||
114 | memset(kip->slot_used, 0, INSNS_PER_PAGE); | ||
115 | kip->slot_used[0] = 1; | ||
116 | kip->nused = 1; | ||
117 | return kip->insns; | ||
118 | } | ||
119 | |||
120 | void free_insn_slot(kprobe_opcode_t *slot) | ||
121 | { | ||
122 | struct kprobe_insn_page *kip; | ||
123 | struct hlist_node *pos; | ||
124 | |||
125 | hlist_for_each(pos, &kprobe_insn_pages) { | ||
126 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
127 | if (kip->insns <= slot && | ||
128 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | ||
129 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | ||
130 | kip->slot_used[i] = 0; | ||
131 | kip->nused--; | ||
132 | if (kip->nused == 0) { | ||
133 | /* | ||
134 | * Page is no longer in use. Free it unless | ||
135 | * it's the last one. We keep the last one | ||
136 | * so as not to have to set it up again the | ||
137 | * next time somebody inserts a probe. | ||
138 | */ | ||
139 | hlist_del(&kip->hlist); | ||
140 | if (hlist_empty(&kprobe_insn_pages)) { | ||
141 | INIT_HLIST_NODE(&kip->hlist); | ||
142 | hlist_add_head(&kip->hlist, | ||
143 | &kprobe_insn_pages); | ||
144 | } else { | ||
145 | module_free(NULL, kip->insns); | ||
146 | kfree(kip); | ||
147 | } | ||
148 | } | ||
149 | return; | ||
150 | } | ||
151 | } | ||
152 | } | ||
153 | |||
49 | /* Locks kprobe: irqs must be disabled */ | 154 | /* Locks kprobe: irqs must be disabled */ |
50 | void lock_kprobes(void) | 155 | void lock_kprobes(void) |
51 | { | 156 | { |
@@ -78,22 +183,23 @@ struct kprobe *get_kprobe(void *addr) | |||
78 | * Aggregate handlers for multiple kprobes support - these handlers | 183 | * Aggregate handlers for multiple kprobes support - these handlers |
79 | * take care of invoking the individual kprobe handlers on p->list | 184 | * take care of invoking the individual kprobe handlers on p->list |
80 | */ | 185 | */ |
81 | int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | 186 | static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) |
82 | { | 187 | { |
83 | struct kprobe *kp; | 188 | struct kprobe *kp; |
84 | 189 | ||
85 | list_for_each_entry(kp, &p->list, list) { | 190 | list_for_each_entry(kp, &p->list, list) { |
86 | if (kp->pre_handler) { | 191 | if (kp->pre_handler) { |
87 | curr_kprobe = kp; | 192 | curr_kprobe = kp; |
88 | kp->pre_handler(kp, regs); | 193 | if (kp->pre_handler(kp, regs)) |
89 | curr_kprobe = NULL; | 194 | return 1; |
90 | } | 195 | } |
196 | curr_kprobe = NULL; | ||
91 | } | 197 | } |
92 | return 0; | 198 | return 0; |
93 | } | 199 | } |
94 | 200 | ||
95 | void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | 201 | static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, |
96 | unsigned long flags) | 202 | unsigned long flags) |
97 | { | 203 | { |
98 | struct kprobe *kp; | 204 | struct kprobe *kp; |
99 | 205 | ||
@@ -107,7 +213,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
107 | return; | 213 | return; |
108 | } | 214 | } |
109 | 215 | ||
110 | int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) | 216 | static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
217 | int trapnr) | ||
111 | { | 218 | { |
112 | /* | 219 | /* |
113 | * if we faulted "during" the execution of a user specified | 220 | * if we faulted "during" the execution of a user specified |
@@ -120,19 +227,159 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr) | |||
120 | return 0; | 227 | return 0; |
121 | } | 228 | } |
122 | 229 | ||
230 | static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | ||
231 | { | ||
232 | struct kprobe *kp = curr_kprobe; | ||
233 | if (curr_kprobe && kp->break_handler) { | ||
234 | if (kp->break_handler(kp, regs)) { | ||
235 | curr_kprobe = NULL; | ||
236 | return 1; | ||
237 | } | ||
238 | } | ||
239 | curr_kprobe = NULL; | ||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) | ||
244 | { | ||
245 | struct hlist_node *node; | ||
246 | struct kretprobe_instance *ri; | ||
247 | hlist_for_each_entry(ri, node, &rp->free_instances, uflist) | ||
248 | return ri; | ||
249 | return NULL; | ||
250 | } | ||
251 | |||
252 | static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) | ||
253 | { | ||
254 | struct hlist_node *node; | ||
255 | struct kretprobe_instance *ri; | ||
256 | hlist_for_each_entry(ri, node, &rp->used_instances, uflist) | ||
257 | return ri; | ||
258 | return NULL; | ||
259 | } | ||
260 | |||
261 | void add_rp_inst(struct kretprobe_instance *ri) | ||
262 | { | ||
263 | /* | ||
264 | * Remove rp inst off the free list - | ||
265 | * Add it back when probed function returns | ||
266 | */ | ||
267 | hlist_del(&ri->uflist); | ||
268 | |||
269 | /* Add rp inst onto table */ | ||
270 | INIT_HLIST_NODE(&ri->hlist); | ||
271 | hlist_add_head(&ri->hlist, | ||
272 | &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]); | ||
273 | |||
274 | /* Also add this rp inst to the used list. */ | ||
275 | INIT_HLIST_NODE(&ri->uflist); | ||
276 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | ||
277 | } | ||
278 | |||
279 | void recycle_rp_inst(struct kretprobe_instance *ri) | ||
280 | { | ||
281 | /* remove rp inst off the kretprobe_inst_table */ | ||
282 | hlist_del(&ri->hlist); | ||
283 | if (ri->rp) { | ||
284 | /* remove rp inst off the used list */ | ||
285 | hlist_del(&ri->uflist); | ||
286 | /* put rp inst back onto the free list */ | ||
287 | INIT_HLIST_NODE(&ri->uflist); | ||
288 | hlist_add_head(&ri->uflist, &ri->rp->free_instances); | ||
289 | } else | ||
290 | /* Unregistering */ | ||
291 | kfree(ri); | ||
292 | } | ||
293 | |||
294 | struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) | ||
295 | { | ||
296 | return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * This function is called from exit_thread or flush_thread when task tk's | ||
301 | * stack is being recycled so that we can recycle any function-return probe | ||
302 | * instances associated with this task. These left over instances represent | ||
303 | * probed functions that have been called but will never return. | ||
304 | */ | ||
305 | void kprobe_flush_task(struct task_struct *tk) | ||
306 | { | ||
307 | struct kretprobe_instance *ri; | ||
308 | struct hlist_head *head; | ||
309 | struct hlist_node *node, *tmp; | ||
310 | unsigned long flags = 0; | ||
311 | |||
312 | spin_lock_irqsave(&kprobe_lock, flags); | ||
313 | head = kretprobe_inst_table_head(current); | ||
314 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | ||
315 | if (ri->task == tk) | ||
316 | recycle_rp_inst(ri); | ||
317 | } | ||
318 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
319 | } | ||
320 | |||
321 | /* | ||
322 | * This kprobe pre_handler is registered with every kretprobe. When probe | ||
323 | * hits it will set up the return probe. | ||
324 | */ | ||
325 | static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) | ||
326 | { | ||
327 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); | ||
328 | |||
329 | /* TODO: consider swapping the RA only after the last pre_handler has fired */ | ||
330 | arch_prepare_kretprobe(rp, regs); | ||
331 | return 0; | ||
332 | } | ||
333 | |||
334 | static inline void free_rp_inst(struct kretprobe *rp) | ||
335 | { | ||
336 | struct kretprobe_instance *ri; | ||
337 | while ((ri = get_free_rp_inst(rp)) != NULL) { | ||
338 | hlist_del(&ri->uflist); | ||
339 | kfree(ri); | ||
340 | } | ||
341 | } | ||
342 | |||
343 | /* | ||
344 | * Keep all fields in the kprobe consistent | ||
345 | */ | ||
346 | static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
347 | { | ||
348 | memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); | ||
349 | memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | * Add the new probe to old_p->list. Fail if this is the | ||
354 | * second jprobe at the address - two jprobes can't coexist | ||
355 | */ | ||
356 | static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | ||
357 | { | ||
358 | struct kprobe *kp; | ||
359 | |||
360 | if (p->break_handler) { | ||
361 | list_for_each_entry(kp, &old_p->list, list) { | ||
362 | if (kp->break_handler) | ||
363 | return -EEXIST; | ||
364 | } | ||
365 | list_add_tail(&p->list, &old_p->list); | ||
366 | } else | ||
367 | list_add(&p->list, &old_p->list); | ||
368 | return 0; | ||
369 | } | ||
370 | |||
123 | /* | 371 | /* |
124 | * Fill in the required fields of the "manager kprobe". Replace the | 372 | * Fill in the required fields of the "manager kprobe". Replace the |
125 | * earlier kprobe in the hlist with the manager kprobe | 373 | * earlier kprobe in the hlist with the manager kprobe |
126 | */ | 374 | */ |
127 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | 375 | static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) |
128 | { | 376 | { |
377 | copy_kprobe(p, ap); | ||
129 | ap->addr = p->addr; | 378 | ap->addr = p->addr; |
130 | ap->opcode = p->opcode; | ||
131 | memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn)); | ||
132 | |||
133 | ap->pre_handler = aggr_pre_handler; | 379 | ap->pre_handler = aggr_pre_handler; |
134 | ap->post_handler = aggr_post_handler; | 380 | ap->post_handler = aggr_post_handler; |
135 | ap->fault_handler = aggr_fault_handler; | 381 | ap->fault_handler = aggr_fault_handler; |
382 | ap->break_handler = aggr_break_handler; | ||
136 | 383 | ||
137 | INIT_LIST_HEAD(&ap->list); | 384 | INIT_LIST_HEAD(&ap->list); |
138 | list_add(&p->list, &ap->list); | 385 | list_add(&p->list, &ap->list); |
@@ -153,16 +400,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
153 | int ret = 0; | 400 | int ret = 0; |
154 | struct kprobe *ap; | 401 | struct kprobe *ap; |
155 | 402 | ||
156 | if (old_p->break_handler || p->break_handler) { | 403 | if (old_p->pre_handler == aggr_pre_handler) { |
157 | ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */ | 404 | copy_kprobe(old_p, p); |
158 | } else if (old_p->pre_handler == aggr_pre_handler) { | 405 | ret = add_new_kprobe(old_p, p); |
159 | list_add(&p->list, &old_p->list); | ||
160 | } else { | 406 | } else { |
161 | ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); | 407 | ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC); |
162 | if (!ap) | 408 | if (!ap) |
163 | return -ENOMEM; | 409 | return -ENOMEM; |
164 | add_aggr_kprobe(ap, old_p); | 410 | add_aggr_kprobe(ap, old_p); |
165 | list_add(&p->list, &ap->list); | 411 | copy_kprobe(ap, p); |
412 | ret = add_new_kprobe(ap, p); | ||
166 | } | 413 | } |
167 | return ret; | 414 | return ret; |
168 | } | 415 | } |
@@ -170,10 +417,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
170 | /* kprobe removal house-keeping routines */ | 417 | /* kprobe removal house-keeping routines */ |
171 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) | 418 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) |
172 | { | 419 | { |
173 | *p->addr = p->opcode; | 420 | arch_disarm_kprobe(p); |
174 | hlist_del(&p->hlist); | 421 | hlist_del(&p->hlist); |
175 | flush_icache_range((unsigned long) p->addr, | ||
176 | (unsigned long) p->addr + sizeof(kprobe_opcode_t)); | ||
177 | spin_unlock_irqrestore(&kprobe_lock, flags); | 422 | spin_unlock_irqrestore(&kprobe_lock, flags); |
178 | arch_remove_kprobe(p); | 423 | arch_remove_kprobe(p); |
179 | } | 424 | } |
@@ -200,6 +445,7 @@ int register_kprobe(struct kprobe *p) | |||
200 | } | 445 | } |
201 | spin_lock_irqsave(&kprobe_lock, flags); | 446 | spin_lock_irqsave(&kprobe_lock, flags); |
202 | old_p = get_kprobe(p->addr); | 447 | old_p = get_kprobe(p->addr); |
448 | p->nmissed = 0; | ||
203 | if (old_p) { | 449 | if (old_p) { |
204 | ret = register_aggr_kprobe(old_p, p); | 450 | ret = register_aggr_kprobe(old_p, p); |
205 | goto out; | 451 | goto out; |
@@ -210,10 +456,8 @@ int register_kprobe(struct kprobe *p) | |||
210 | hlist_add_head(&p->hlist, | 456 | hlist_add_head(&p->hlist, |
211 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 457 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
212 | 458 | ||
213 | p->opcode = *p->addr; | 459 | arch_arm_kprobe(p); |
214 | *p->addr = BREAKPOINT_INSTRUCTION; | 460 | |
215 | flush_icache_range((unsigned long) p->addr, | ||
216 | (unsigned long) p->addr + sizeof(kprobe_opcode_t)); | ||
217 | out: | 461 | out: |
218 | spin_unlock_irqrestore(&kprobe_lock, flags); | 462 | spin_unlock_irqrestore(&kprobe_lock, flags); |
219 | rm_kprobe: | 463 | rm_kprobe: |
@@ -257,16 +501,83 @@ void unregister_jprobe(struct jprobe *jp) | |||
257 | unregister_kprobe(&jp->kp); | 501 | unregister_kprobe(&jp->kp); |
258 | } | 502 | } |
259 | 503 | ||
504 | #ifdef ARCH_SUPPORTS_KRETPROBES | ||
505 | |||
506 | int register_kretprobe(struct kretprobe *rp) | ||
507 | { | ||
508 | int ret = 0; | ||
509 | struct kretprobe_instance *inst; | ||
510 | int i; | ||
511 | |||
512 | rp->kp.pre_handler = pre_handler_kretprobe; | ||
513 | |||
514 | /* Pre-allocate memory for max kretprobe instances */ | ||
515 | if (rp->maxactive <= 0) { | ||
516 | #ifdef CONFIG_PREEMPT | ||
517 | rp->maxactive = max(10, 2 * NR_CPUS); | ||
518 | #else | ||
519 | rp->maxactive = NR_CPUS; | ||
520 | #endif | ||
521 | } | ||
522 | INIT_HLIST_HEAD(&rp->used_instances); | ||
523 | INIT_HLIST_HEAD(&rp->free_instances); | ||
524 | for (i = 0; i < rp->maxactive; i++) { | ||
525 | inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL); | ||
526 | if (inst == NULL) { | ||
527 | free_rp_inst(rp); | ||
528 | return -ENOMEM; | ||
529 | } | ||
530 | INIT_HLIST_NODE(&inst->uflist); | ||
531 | hlist_add_head(&inst->uflist, &rp->free_instances); | ||
532 | } | ||
533 | |||
534 | rp->nmissed = 0; | ||
535 | /* Establish function entry probe point */ | ||
536 | if ((ret = register_kprobe(&rp->kp)) != 0) | ||
537 | free_rp_inst(rp); | ||
538 | return ret; | ||
539 | } | ||
540 | |||
541 | #else /* ARCH_SUPPORTS_KRETPROBES */ | ||
542 | |||
543 | int register_kretprobe(struct kretprobe *rp) | ||
544 | { | ||
545 | return -ENOSYS; | ||
546 | } | ||
547 | |||
548 | #endif /* ARCH_SUPPORTS_KRETPROBES */ | ||
549 | |||
550 | void unregister_kretprobe(struct kretprobe *rp) | ||
551 | { | ||
552 | unsigned long flags; | ||
553 | struct kretprobe_instance *ri; | ||
554 | |||
555 | unregister_kprobe(&rp->kp); | ||
556 | /* No race here */ | ||
557 | spin_lock_irqsave(&kprobe_lock, flags); | ||
558 | free_rp_inst(rp); | ||
559 | while ((ri = get_used_rp_inst(rp)) != NULL) { | ||
560 | ri->rp = NULL; | ||
561 | hlist_del(&ri->uflist); | ||
562 | } | ||
563 | spin_unlock_irqrestore(&kprobe_lock, flags); | ||
564 | } | ||
565 | |||
260 | static int __init init_kprobes(void) | 566 | static int __init init_kprobes(void) |
261 | { | 567 | { |
262 | int i, err = 0; | 568 | int i, err = 0; |
263 | 569 | ||
264 | /* FIXME allocate the probe table, currently defined statically */ | 570 | /* FIXME allocate the probe table, currently defined statically */ |
265 | /* initialize all list heads */ | 571 | /* initialize all list heads */ |
266 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) | 572 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
267 | INIT_HLIST_HEAD(&kprobe_table[i]); | 573 | INIT_HLIST_HEAD(&kprobe_table[i]); |
574 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | ||
575 | } | ||
576 | |||
577 | err = arch_init_kprobes(); | ||
578 | if (!err) | ||
579 | err = register_die_notifier(&kprobe_exceptions_nb); | ||
268 | 580 | ||
269 | err = register_die_notifier(&kprobe_exceptions_nb); | ||
270 | return err; | 581 | return err; |
271 | } | 582 | } |
272 | 583 | ||
@@ -277,3 +588,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe); | |||
277 | EXPORT_SYMBOL_GPL(register_jprobe); | 588 | EXPORT_SYMBOL_GPL(register_jprobe); |
278 | EXPORT_SYMBOL_GPL(unregister_jprobe); | 589 | EXPORT_SYMBOL_GPL(unregister_jprobe); |
279 | EXPORT_SYMBOL_GPL(jprobe_return); | 590 | EXPORT_SYMBOL_GPL(jprobe_return); |
591 | EXPORT_SYMBOL_GPL(register_kretprobe); | ||
592 | EXPORT_SYMBOL_GPL(unregister_kretprobe); | ||
593 | |||
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 1f064a63f8cf..015fb69ad94d 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) | |||
30 | KERNEL_ATTR_RO(hotplug_seqnum); | 30 | KERNEL_ATTR_RO(hotplug_seqnum); |
31 | #endif | 31 | #endif |
32 | 32 | ||
33 | #ifdef CONFIG_KEXEC | ||
34 | #include <asm/kexec.h> | ||
35 | |||
36 | static ssize_t crash_notes_show(struct subsystem *subsys, char *page) | ||
37 | { | ||
38 | return sprintf(page, "%p\n", (void *)crash_notes); | ||
39 | } | ||
40 | KERNEL_ATTR_RO(crash_notes); | ||
41 | #endif | ||
42 | |||
33 | decl_subsys(kernel, NULL, NULL); | 43 | decl_subsys(kernel, NULL, NULL); |
34 | EXPORT_SYMBOL_GPL(kernel_subsys); | 44 | EXPORT_SYMBOL_GPL(kernel_subsys); |
35 | 45 | ||
@@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = { | |||
37 | #ifdef CONFIG_HOTPLUG | 47 | #ifdef CONFIG_HOTPLUG |
38 | &hotplug_seqnum_attr.attr, | 48 | &hotplug_seqnum_attr.attr, |
39 | #endif | 49 | #endif |
50 | #ifdef CONFIG_KEXEC | ||
51 | &crash_notes_attr.attr, | ||
52 | #endif | ||
40 | NULL | 53 | NULL |
41 | }; | 54 | }; |
42 | 55 | ||
diff --git a/kernel/module.c b/kernel/module.c index a566745dde62..c32995fbd8fd 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
36 | #include <linux/stop_machine.h> | 36 | #include <linux/stop_machine.h> |
37 | #include <linux/device.h> | 37 | #include <linux/device.h> |
38 | #include <linux/string.h> | ||
38 | #include <asm/uaccess.h> | 39 | #include <asm/uaccess.h> |
39 | #include <asm/semaphore.h> | 40 | #include <asm/semaphore.h> |
40 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
@@ -249,13 +250,18 @@ static inline unsigned int block_size(int val) | |||
249 | /* Created by linker magic */ | 250 | /* Created by linker magic */ |
250 | extern char __per_cpu_start[], __per_cpu_end[]; | 251 | extern char __per_cpu_start[], __per_cpu_end[]; |
251 | 252 | ||
252 | static void *percpu_modalloc(unsigned long size, unsigned long align) | 253 | static void *percpu_modalloc(unsigned long size, unsigned long align, |
254 | const char *name) | ||
253 | { | 255 | { |
254 | unsigned long extra; | 256 | unsigned long extra; |
255 | unsigned int i; | 257 | unsigned int i; |
256 | void *ptr; | 258 | void *ptr; |
257 | 259 | ||
258 | BUG_ON(align > SMP_CACHE_BYTES); | 260 | if (align > SMP_CACHE_BYTES) { |
261 | printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", | ||
262 | name, align, SMP_CACHE_BYTES); | ||
263 | align = SMP_CACHE_BYTES; | ||
264 | } | ||
259 | 265 | ||
260 | ptr = __per_cpu_start; | 266 | ptr = __per_cpu_start; |
261 | for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { | 267 | for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { |
@@ -347,7 +353,8 @@ static int percpu_modinit(void) | |||
347 | } | 353 | } |
348 | __initcall(percpu_modinit); | 354 | __initcall(percpu_modinit); |
349 | #else /* ... !CONFIG_SMP */ | 355 | #else /* ... !CONFIG_SMP */ |
350 | static inline void *percpu_modalloc(unsigned long size, unsigned long align) | 356 | static inline void *percpu_modalloc(unsigned long size, unsigned long align, |
357 | const char *name) | ||
351 | { | 358 | { |
352 | return NULL; | 359 | return NULL; |
353 | } | 360 | } |
@@ -370,6 +377,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, | |||
370 | #endif /* CONFIG_SMP */ | 377 | #endif /* CONFIG_SMP */ |
371 | 378 | ||
372 | #ifdef CONFIG_MODULE_UNLOAD | 379 | #ifdef CONFIG_MODULE_UNLOAD |
380 | #define MODINFO_ATTR(field) \ | ||
381 | static void setup_modinfo_##field(struct module *mod, const char *s) \ | ||
382 | { \ | ||
383 | mod->field = kstrdup(s, GFP_KERNEL); \ | ||
384 | } \ | ||
385 | static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ | ||
386 | struct module *mod, char *buffer) \ | ||
387 | { \ | ||
388 | return sprintf(buffer, "%s\n", mod->field); \ | ||
389 | } \ | ||
390 | static int modinfo_##field##_exists(struct module *mod) \ | ||
391 | { \ | ||
392 | return mod->field != NULL; \ | ||
393 | } \ | ||
394 | static void free_modinfo_##field(struct module *mod) \ | ||
395 | { \ | ||
396 | kfree(mod->field); \ | ||
397 | mod->field = NULL; \ | ||
398 | } \ | ||
399 | static struct module_attribute modinfo_##field = { \ | ||
400 | .attr = { .name = __stringify(field), .mode = 0444, \ | ||
401 | .owner = THIS_MODULE }, \ | ||
402 | .show = show_modinfo_##field, \ | ||
403 | .setup = setup_modinfo_##field, \ | ||
404 | .test = modinfo_##field##_exists, \ | ||
405 | .free = free_modinfo_##field, \ | ||
406 | }; | ||
407 | |||
408 | MODINFO_ATTR(version); | ||
409 | MODINFO_ATTR(srcversion); | ||
410 | |||
411 | static struct module_attribute *modinfo_attrs[] = { | ||
412 | &modinfo_version, | ||
413 | &modinfo_srcversion, | ||
414 | NULL, | ||
415 | }; | ||
416 | |||
373 | /* Init the unload section of the module. */ | 417 | /* Init the unload section of the module. */ |
374 | static void module_unload_init(struct module *mod) | 418 | static void module_unload_init(struct module *mod) |
375 | { | 419 | { |
@@ -692,7 +736,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp) | |||
692 | return 0; | 736 | return 0; |
693 | } | 737 | } |
694 | 738 | ||
695 | int set_obsolete(const char *val, struct kernel_param *kp) | 739 | static int set_obsolete(const char *val, struct kernel_param *kp) |
696 | { | 740 | { |
697 | unsigned int min, max; | 741 | unsigned int min, max; |
698 | unsigned int size, maxsize; | 742 | unsigned int size, maxsize; |
@@ -1031,6 +1075,32 @@ static void module_remove_refcnt_attr(struct module *mod) | |||
1031 | } | 1075 | } |
1032 | #endif | 1076 | #endif |
1033 | 1077 | ||
1078 | #ifdef CONFIG_MODULE_UNLOAD | ||
1079 | static int module_add_modinfo_attrs(struct module *mod) | ||
1080 | { | ||
1081 | struct module_attribute *attr; | ||
1082 | int error = 0; | ||
1083 | int i; | ||
1084 | |||
1085 | for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) { | ||
1086 | if (!attr->test || | ||
1087 | (attr->test && attr->test(mod))) | ||
1088 | error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr); | ||
1089 | } | ||
1090 | return error; | ||
1091 | } | ||
1092 | |||
1093 | static void module_remove_modinfo_attrs(struct module *mod) | ||
1094 | { | ||
1095 | struct module_attribute *attr; | ||
1096 | int i; | ||
1097 | |||
1098 | for (i = 0; (attr = modinfo_attrs[i]); i++) { | ||
1099 | sysfs_remove_file(&mod->mkobj.kobj,&attr->attr); | ||
1100 | attr->free(mod); | ||
1101 | } | ||
1102 | } | ||
1103 | #endif | ||
1034 | 1104 | ||
1035 | static int mod_sysfs_setup(struct module *mod, | 1105 | static int mod_sysfs_setup(struct module *mod, |
1036 | struct kernel_param *kparam, | 1106 | struct kernel_param *kparam, |
@@ -1056,6 +1126,12 @@ static int mod_sysfs_setup(struct module *mod, | |||
1056 | if (err) | 1126 | if (err) |
1057 | goto out_unreg; | 1127 | goto out_unreg; |
1058 | 1128 | ||
1129 | #ifdef CONFIG_MODULE_UNLOAD | ||
1130 | err = module_add_modinfo_attrs(mod); | ||
1131 | if (err) | ||
1132 | goto out_unreg; | ||
1133 | #endif | ||
1134 | |||
1059 | return 0; | 1135 | return 0; |
1060 | 1136 | ||
1061 | out_unreg: | 1137 | out_unreg: |
@@ -1066,6 +1142,9 @@ out: | |||
1066 | 1142 | ||
1067 | static void mod_kobject_remove(struct module *mod) | 1143 | static void mod_kobject_remove(struct module *mod) |
1068 | { | 1144 | { |
1145 | #ifdef CONFIG_MODULE_UNLOAD | ||
1146 | module_remove_modinfo_attrs(mod); | ||
1147 | #endif | ||
1069 | module_remove_refcnt_attr(mod); | 1148 | module_remove_refcnt_attr(mod); |
1070 | module_param_sysfs_remove(mod); | 1149 | module_param_sysfs_remove(mod); |
1071 | 1150 | ||
@@ -1311,6 +1390,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs, | |||
1311 | return NULL; | 1390 | return NULL; |
1312 | } | 1391 | } |
1313 | 1392 | ||
1393 | #ifdef CONFIG_MODULE_UNLOAD | ||
1394 | static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, | ||
1395 | unsigned int infoindex) | ||
1396 | { | ||
1397 | struct module_attribute *attr; | ||
1398 | int i; | ||
1399 | |||
1400 | for (i = 0; (attr = modinfo_attrs[i]); i++) { | ||
1401 | if (attr->setup) | ||
1402 | attr->setup(mod, | ||
1403 | get_modinfo(sechdrs, | ||
1404 | infoindex, | ||
1405 | attr->attr.name)); | ||
1406 | } | ||
1407 | } | ||
1408 | #endif | ||
1409 | |||
1314 | #ifdef CONFIG_KALLSYMS | 1410 | #ifdef CONFIG_KALLSYMS |
1315 | int is_exported(const char *name, const struct module *mod) | 1411 | int is_exported(const char *name, const struct module *mod) |
1316 | { | 1412 | { |
@@ -1554,7 +1650,8 @@ static struct module *load_module(void __user *umod, | |||
1554 | if (pcpuindex) { | 1650 | if (pcpuindex) { |
1555 | /* We have a special allocation for this section. */ | 1651 | /* We have a special allocation for this section. */ |
1556 | percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, | 1652 | percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, |
1557 | sechdrs[pcpuindex].sh_addralign); | 1653 | sechdrs[pcpuindex].sh_addralign, |
1654 | mod->name); | ||
1558 | if (!percpu) { | 1655 | if (!percpu) { |
1559 | err = -ENOMEM; | 1656 | err = -ENOMEM; |
1560 | goto free_mod; | 1657 | goto free_mod; |
@@ -1615,6 +1712,11 @@ static struct module *load_module(void __user *umod, | |||
1615 | /* Set up license info based on the info section */ | 1712 | /* Set up license info based on the info section */ |
1616 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1713 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1617 | 1714 | ||
1715 | #ifdef CONFIG_MODULE_UNLOAD | ||
1716 | /* Set up MODINFO_ATTR fields */ | ||
1717 | setup_modinfo(mod, sechdrs, infoindex); | ||
1718 | #endif | ||
1719 | |||
1618 | /* Fix up syms, so that st_value is a pointer to location. */ | 1720 | /* Fix up syms, so that st_value is a pointer to location. */ |
1619 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, | 1721 | err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex, |
1620 | mod); | 1722 | mod); |
diff --git a/kernel/panic.c b/kernel/panic.c index 081f7465fc8d..aabc5f86fa3f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/sysrq.h> | 18 | #include <linux/sysrq.h> |
19 | #include <linux/interrupt.h> | 19 | #include <linux/interrupt.h> |
20 | #include <linux/nmi.h> | 20 | #include <linux/nmi.h> |
21 | #include <linux/kexec.h> | ||
21 | 22 | ||
22 | int panic_timeout; | 23 | int panic_timeout; |
23 | int panic_on_oops; | 24 | int panic_on_oops; |
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
63 | unsigned long caller = (unsigned long) __builtin_return_address(0); | 64 | unsigned long caller = (unsigned long) __builtin_return_address(0); |
64 | #endif | 65 | #endif |
65 | 66 | ||
67 | /* | ||
68 | * It's possible to come here directly from a panic-assertion and not | ||
69 | * have preempt disabled. Some functions called from here want | ||
70 | * preempt to be disabled. No point enabling it later though... | ||
71 | */ | ||
72 | preempt_disable(); | ||
73 | |||
66 | bust_spinlocks(1); | 74 | bust_spinlocks(1); |
67 | va_start(args, fmt); | 75 | va_start(args, fmt); |
68 | vsnprintf(buf, sizeof(buf), fmt, args); | 76 | vsnprintf(buf, sizeof(buf), fmt, args); |
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
70 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); | 78 | printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); |
71 | bust_spinlocks(0); | 79 | bust_spinlocks(0); |
72 | 80 | ||
81 | /* | ||
82 | * If we have crashed and we have a crash kernel loaded let it handle | ||
83 | * everything else. | ||
84 | * Do we want to call this before we try to display a message? | ||
85 | */ | ||
86 | crash_kexec(NULL); | ||
87 | |||
73 | #ifdef CONFIG_SMP | 88 | #ifdef CONFIG_SMP |
89 | /* | ||
90 | * Note smp_send_stop is the usual smp shutdown function, which | ||
91 | * unfortunately means it may not be hardened to work in a panic | ||
92 | * situation. | ||
93 | */ | ||
74 | smp_send_stop(); | 94 | smp_send_stop(); |
75 | #endif | 95 | #endif |
76 | 96 | ||
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
79 | if (!panic_blink) | 99 | if (!panic_blink) |
80 | panic_blink = no_blink; | 100 | panic_blink = no_blink; |
81 | 101 | ||
82 | if (panic_timeout > 0) | 102 | if (panic_timeout > 0) { |
83 | { | ||
84 | /* | 103 | /* |
85 | * Delay timeout seconds before rebooting the machine. | 104 | * Delay timeout seconds before rebooting the machine. |
86 | * We can't use the "normal" timers since we just panicked.. | 105 | * We can't use the "normal" timers since we just panicked.. |
@@ -92,12 +111,11 @@ NORET_TYPE void panic(const char * fmt, ...) | |||
92 | mdelay(1); | 111 | mdelay(1); |
93 | i++; | 112 | i++; |
94 | } | 113 | } |
95 | /* | 114 | /* This will not be a clean reboot, with everything |
96 | * Should we run the reboot notifier. For the moment Im | 115 | * shutting down. But if there is a chance of |
97 | * choosing not too. It might crash, be corrupt or do | 116 | * rebooting the system it will be rebooted. |
98 | * more harm than good for other reasons. | ||
99 | */ | 117 | */ |
100 | machine_restart(NULL); | 118 | emergency_restart(); |
101 | } | 119 | } |
102 | #ifdef __sparc__ | 120 | #ifdef __sparc__ |
103 | { | 121 | { |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index cabb63fc9e16..38798a2ff994 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -89,23 +89,6 @@ static struct idr posix_timers_id; | |||
89 | static DEFINE_SPINLOCK(idr_lock); | 89 | static DEFINE_SPINLOCK(idr_lock); |
90 | 90 | ||
91 | /* | 91 | /* |
92 | * Just because the timer is not in the timer list does NOT mean it is | ||
93 | * inactive. It could be in the "fire" routine getting a new expire time. | ||
94 | */ | ||
95 | #define TIMER_INACTIVE 1 | ||
96 | |||
97 | #ifdef CONFIG_SMP | ||
98 | # define timer_active(tmr) \ | ||
99 | ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE) | ||
100 | # define set_timer_inactive(tmr) \ | ||
101 | do { \ | ||
102 | (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \ | ||
103 | } while (0) | ||
104 | #else | ||
105 | # define timer_active(tmr) BARFY // error to use outside of SMP | ||
106 | # define set_timer_inactive(tmr) do { } while (0) | ||
107 | #endif | ||
108 | /* | ||
109 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other | 92 | * we assume that the new SIGEV_THREAD_ID shares no bits with the other |
110 | * SIGEV values. Here we put out an error if this assumption fails. | 93 | * SIGEV values. Here we put out an error if this assumption fails. |
111 | */ | 94 | */ |
@@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer) | |||
226 | init_timer(&new_timer->it.real.timer); | 209 | init_timer(&new_timer->it.real.timer); |
227 | new_timer->it.real.timer.data = (unsigned long) new_timer; | 210 | new_timer->it.real.timer.data = (unsigned long) new_timer; |
228 | new_timer->it.real.timer.function = posix_timer_fn; | 211 | new_timer->it.real.timer.function = posix_timer_fn; |
229 | set_timer_inactive(new_timer); | ||
230 | return 0; | 212 | return 0; |
231 | } | 213 | } |
232 | 214 | ||
@@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data) | |||
480 | int do_notify = 1; | 462 | int do_notify = 1; |
481 | 463 | ||
482 | spin_lock_irqsave(&timr->it_lock, flags); | 464 | spin_lock_irqsave(&timr->it_lock, flags); |
483 | set_timer_inactive(timr); | ||
484 | if (!list_empty(&timr->it.real.abs_timer_entry)) { | 465 | if (!list_empty(&timr->it.real.abs_timer_entry)) { |
485 | spin_lock(&abs_list.lock); | 466 | spin_lock(&abs_list.lock); |
486 | do { | 467 | do { |
@@ -915,21 +896,10 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, | |||
915 | jiffies_64_f = get_jiffies_64(); | 896 | jiffies_64_f = get_jiffies_64(); |
916 | } | 897 | } |
917 | /* | 898 | /* |
918 | * Take away now to get delta | 899 | * Take away now to get delta and normalize |
919 | */ | ||
920 | oc.tv_sec -= now.tv_sec; | ||
921 | oc.tv_nsec -= now.tv_nsec; | ||
922 | /* | ||
923 | * Normalize... | ||
924 | */ | 900 | */ |
925 | while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) { | 901 | set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec, |
926 | oc.tv_nsec -= NSEC_PER_SEC; | 902 | oc.tv_nsec - now.tv_nsec); |
927 | oc.tv_sec++; | ||
928 | } | ||
929 | while ((oc.tv_nsec) < 0) { | ||
930 | oc.tv_nsec += NSEC_PER_SEC; | ||
931 | oc.tv_sec--; | ||
932 | } | ||
933 | }else{ | 903 | }else{ |
934 | jiffies_64_f = get_jiffies_64(); | 904 | jiffies_64_f = get_jiffies_64(); |
935 | } | 905 | } |
@@ -983,8 +953,8 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
983 | * careful here. If smp we could be in the "fire" routine which will | 953 | * careful here. If smp we could be in the "fire" routine which will |
984 | * be spinning as we hold the lock. But this is ONLY an SMP issue. | 954 | * be spinning as we hold the lock. But this is ONLY an SMP issue. |
985 | */ | 955 | */ |
956 | if (try_to_del_timer_sync(&timr->it.real.timer) < 0) { | ||
986 | #ifdef CONFIG_SMP | 957 | #ifdef CONFIG_SMP |
987 | if (timer_active(timr) && !del_timer(&timr->it.real.timer)) | ||
988 | /* | 958 | /* |
989 | * It can only be active if on an other cpu. Since | 959 | * It can only be active if on an other cpu. Since |
990 | * we have cleared the interval stuff above, it should | 960 | * we have cleared the interval stuff above, it should |
@@ -994,11 +964,9 @@ common_timer_set(struct k_itimer *timr, int flags, | |||
994 | * a "retry" exit status. | 964 | * a "retry" exit status. |
995 | */ | 965 | */ |
996 | return TIMER_RETRY; | 966 | return TIMER_RETRY; |
997 | |||
998 | set_timer_inactive(timr); | ||
999 | #else | ||
1000 | del_timer(&timr->it.real.timer); | ||
1001 | #endif | 967 | #endif |
968 | } | ||
969 | |||
1002 | remove_from_abslist(timr); | 970 | remove_from_abslist(timr); |
1003 | 971 | ||
1004 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & | 972 | timr->it_requeue_pending = (timr->it_requeue_pending + 2) & |
@@ -1083,8 +1051,9 @@ retry: | |||
1083 | static inline int common_timer_del(struct k_itimer *timer) | 1051 | static inline int common_timer_del(struct k_itimer *timer) |
1084 | { | 1052 | { |
1085 | timer->it.real.incr = 0; | 1053 | timer->it.real.incr = 0; |
1054 | |||
1055 | if (try_to_del_timer_sync(&timer->it.real.timer) < 0) { | ||
1086 | #ifdef CONFIG_SMP | 1056 | #ifdef CONFIG_SMP |
1087 | if (timer_active(timer) && !del_timer(&timer->it.real.timer)) | ||
1088 | /* | 1057 | /* |
1089 | * It can only be active if on an other cpu. Since | 1058 | * It can only be active if on an other cpu. Since |
1090 | * we have cleared the interval stuff above, it should | 1059 | * we have cleared the interval stuff above, it should |
@@ -1094,9 +1063,9 @@ static inline int common_timer_del(struct k_itimer *timer) | |||
1094 | * a "retry" exit status. | 1063 | * a "retry" exit status. |
1095 | */ | 1064 | */ |
1096 | return TIMER_RETRY; | 1065 | return TIMER_RETRY; |
1097 | #else | ||
1098 | del_timer(&timer->it.real.timer); | ||
1099 | #endif | 1066 | #endif |
1067 | } | ||
1068 | |||
1100 | remove_from_abslist(timer); | 1069 | remove_from_abslist(timer); |
1101 | 1070 | ||
1102 | return 0; | 1071 | return 0; |
@@ -1197,7 +1166,6 @@ void exit_itimers(struct signal_struct *sig) | |||
1197 | tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); | 1166 | tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); |
1198 | itimer_delete(tmr); | 1167 | itimer_delete(tmr); |
1199 | } | 1168 | } |
1200 | del_timer_sync(&sig->real_timer); | ||
1201 | } | 1169 | } |
1202 | 1170 | ||
1203 | /* | 1171 | /* |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 696387ffe49c..2c7121d9bff1 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -27,8 +27,8 @@ config PM_DEBUG | |||
27 | like suspend support. | 27 | like suspend support. |
28 | 28 | ||
29 | config SOFTWARE_SUSPEND | 29 | config SOFTWARE_SUSPEND |
30 | bool "Software Suspend (EXPERIMENTAL)" | 30 | bool "Software Suspend" |
31 | depends on EXPERIMENTAL && PM && SWAP | 31 | depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) |
32 | ---help--- | 32 | ---help--- |
33 | Enable the possibility of suspending the machine. | 33 | Enable the possibility of suspending the machine. |
34 | It doesn't need APM. | 34 | It doesn't need APM. |
@@ -72,3 +72,7 @@ config PM_STD_PARTITION | |||
72 | suspended image to. It will simply pick the first available swap | 72 | suspended image to. It will simply pick the first available swap |
73 | device. | 73 | device. |
74 | 74 | ||
75 | config SUSPEND_SMP | ||
76 | bool | ||
77 | depends on HOTPLUG_CPU && X86 && PM | ||
78 | default y | ||
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index fbdc634135a7..2f438d0eaa13 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) | |||
3 | EXTRA_CFLAGS += -DDEBUG | 3 | EXTRA_CFLAGS += -DDEBUG |
4 | endif | 4 | endif |
5 | 5 | ||
6 | swsusp-smp-$(CONFIG_SMP) += smp.o | ||
7 | |||
8 | obj-y := main.o process.o console.o pm.o | 6 | obj-y := main.o process.o console.o pm.o |
9 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o | 7 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o |
8 | |||
9 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | ||
10 | 10 | ||
11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o | 11 | obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 02b6764034dc..664eb0469b6e 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/device.h> | 16 | #include <linux/device.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
19 | #include <linux/mount.h> | ||
20 | |||
19 | #include "power.h" | 21 | #include "power.h" |
20 | 22 | ||
21 | 23 | ||
@@ -57,16 +59,13 @@ static void power_down(suspend_disk_method_t mode) | |||
57 | error = pm_ops->enter(PM_SUSPEND_DISK); | 59 | error = pm_ops->enter(PM_SUSPEND_DISK); |
58 | break; | 60 | break; |
59 | case PM_DISK_SHUTDOWN: | 61 | case PM_DISK_SHUTDOWN: |
60 | printk("Powering off system\n"); | 62 | kernel_power_off(); |
61 | device_shutdown(); | ||
62 | machine_power_off(); | ||
63 | break; | 63 | break; |
64 | case PM_DISK_REBOOT: | 64 | case PM_DISK_REBOOT: |
65 | device_shutdown(); | 65 | kernel_restart(NULL); |
66 | machine_restart(NULL); | ||
67 | break; | 66 | break; |
68 | } | 67 | } |
69 | machine_halt(); | 68 | kernel_halt(); |
70 | /* Valid image is on the disk, if we continue we risk serious data corruption | 69 | /* Valid image is on the disk, if we continue we risk serious data corruption |
71 | after resume. */ | 70 | after resume. */ |
72 | printk(KERN_CRIT "Please power me down manually\n"); | 71 | printk(KERN_CRIT "Please power me down manually\n"); |
@@ -117,8 +116,8 @@ static void finish(void) | |||
117 | { | 116 | { |
118 | device_resume(); | 117 | device_resume(); |
119 | platform_finish(); | 118 | platform_finish(); |
120 | enable_nonboot_cpus(); | ||
121 | thaw_processes(); | 119 | thaw_processes(); |
120 | enable_nonboot_cpus(); | ||
122 | pm_restore_console(); | 121 | pm_restore_console(); |
123 | } | 122 | } |
124 | 123 | ||
@@ -131,28 +130,35 @@ static int prepare_processes(void) | |||
131 | 130 | ||
132 | sys_sync(); | 131 | sys_sync(); |
133 | 132 | ||
133 | disable_nonboot_cpus(); | ||
134 | |||
134 | if (freeze_processes()) { | 135 | if (freeze_processes()) { |
135 | error = -EBUSY; | 136 | error = -EBUSY; |
136 | return error; | 137 | goto thaw; |
137 | } | 138 | } |
138 | 139 | ||
139 | if (pm_disk_mode == PM_DISK_PLATFORM) { | 140 | if (pm_disk_mode == PM_DISK_PLATFORM) { |
140 | if (pm_ops && pm_ops->prepare) { | 141 | if (pm_ops && pm_ops->prepare) { |
141 | if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) | 142 | if ((error = pm_ops->prepare(PM_SUSPEND_DISK))) |
142 | return error; | 143 | goto thaw; |
143 | } | 144 | } |
144 | } | 145 | } |
145 | 146 | ||
146 | /* Free memory before shutting down devices. */ | 147 | /* Free memory before shutting down devices. */ |
147 | free_some_memory(); | 148 | free_some_memory(); |
148 | |||
149 | return 0; | 149 | return 0; |
150 | thaw: | ||
151 | thaw_processes(); | ||
152 | enable_nonboot_cpus(); | ||
153 | pm_restore_console(); | ||
154 | return error; | ||
150 | } | 155 | } |
151 | 156 | ||
152 | static void unprepare_processes(void) | 157 | static void unprepare_processes(void) |
153 | { | 158 | { |
154 | enable_nonboot_cpus(); | 159 | platform_finish(); |
155 | thaw_processes(); | 160 | thaw_processes(); |
161 | enable_nonboot_cpus(); | ||
156 | pm_restore_console(); | 162 | pm_restore_console(); |
157 | } | 163 | } |
158 | 164 | ||
@@ -160,15 +166,9 @@ static int prepare_devices(void) | |||
160 | { | 166 | { |
161 | int error; | 167 | int error; |
162 | 168 | ||
163 | disable_nonboot_cpus(); | 169 | if ((error = device_suspend(PMSG_FREEZE))) |
164 | if ((error = device_suspend(PMSG_FREEZE))) { | ||
165 | printk("Some devices failed to suspend\n"); | 170 | printk("Some devices failed to suspend\n"); |
166 | platform_finish(); | 171 | return error; |
167 | enable_nonboot_cpus(); | ||
168 | return error; | ||
169 | } | ||
170 | |||
171 | return 0; | ||
172 | } | 172 | } |
173 | 173 | ||
174 | /** | 174 | /** |
@@ -185,9 +185,9 @@ int pm_suspend_disk(void) | |||
185 | int error; | 185 | int error; |
186 | 186 | ||
187 | error = prepare_processes(); | 187 | error = prepare_processes(); |
188 | if (!error) { | 188 | if (error) |
189 | error = prepare_devices(); | 189 | return error; |
190 | } | 190 | error = prepare_devices(); |
191 | 191 | ||
192 | if (error) { | 192 | if (error) { |
193 | unprepare_processes(); | 193 | unprepare_processes(); |
@@ -233,6 +233,16 @@ static int software_resume(void) | |||
233 | { | 233 | { |
234 | int error; | 234 | int error; |
235 | 235 | ||
236 | if (!swsusp_resume_device) { | ||
237 | if (!strlen(resume_file)) | ||
238 | return -ENOENT; | ||
239 | swsusp_resume_device = name_to_dev_t(resume_file); | ||
240 | pr_debug("swsusp: Resume From Partition %s\n", resume_file); | ||
241 | } else { | ||
242 | pr_debug("swsusp: Resume From Partition %d:%d\n", | ||
243 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); | ||
244 | } | ||
245 | |||
236 | if (noresume) { | 246 | if (noresume) { |
237 | /** | 247 | /** |
238 | * FIXME: If noresume is specified, we need to find the partition | 248 | * FIXME: If noresume is specified, we need to find the partition |
@@ -250,7 +260,7 @@ static int software_resume(void) | |||
250 | 260 | ||
251 | if ((error = prepare_processes())) { | 261 | if ((error = prepare_processes())) { |
252 | swsusp_close(); | 262 | swsusp_close(); |
253 | goto Cleanup; | 263 | goto Done; |
254 | } | 264 | } |
255 | 265 | ||
256 | pr_debug("PM: Reading swsusp image.\n"); | 266 | pr_debug("PM: Reading swsusp image.\n"); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 4cdebc972ff2..71aa0fd22007 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -19,6 +19,9 @@ | |||
19 | 19 | ||
20 | #include "power.h" | 20 | #include "power.h" |
21 | 21 | ||
22 | /*This is just an arbitrary number */ | ||
23 | #define FREE_PAGE_NUMBER (100) | ||
24 | |||
22 | DECLARE_MUTEX(pm_sem); | 25 | DECLARE_MUTEX(pm_sem); |
23 | 26 | ||
24 | struct pm_ops * pm_ops = NULL; | 27 | struct pm_ops * pm_ops = NULL; |
@@ -49,17 +52,35 @@ void pm_set_ops(struct pm_ops * ops) | |||
49 | static int suspend_prepare(suspend_state_t state) | 52 | static int suspend_prepare(suspend_state_t state) |
50 | { | 53 | { |
51 | int error = 0; | 54 | int error = 0; |
55 | unsigned int free_pages; | ||
52 | 56 | ||
53 | if (!pm_ops || !pm_ops->enter) | 57 | if (!pm_ops || !pm_ops->enter) |
54 | return -EPERM; | 58 | return -EPERM; |
55 | 59 | ||
56 | pm_prepare_console(); | 60 | pm_prepare_console(); |
57 | 61 | ||
62 | disable_nonboot_cpus(); | ||
63 | |||
64 | if (num_online_cpus() != 1) { | ||
65 | error = -EPERM; | ||
66 | goto Enable_cpu; | ||
67 | } | ||
68 | |||
58 | if (freeze_processes()) { | 69 | if (freeze_processes()) { |
59 | error = -EAGAIN; | 70 | error = -EAGAIN; |
60 | goto Thaw; | 71 | goto Thaw; |
61 | } | 72 | } |
62 | 73 | ||
74 | if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) { | ||
75 | pr_debug("PM: free some memory\n"); | ||
76 | shrink_all_memory(FREE_PAGE_NUMBER - free_pages); | ||
77 | if (nr_free_pages() < FREE_PAGE_NUMBER) { | ||
78 | error = -ENOMEM; | ||
79 | printk(KERN_ERR "PM: No enough memory\n"); | ||
80 | goto Thaw; | ||
81 | } | ||
82 | } | ||
83 | |||
63 | if (pm_ops->prepare) { | 84 | if (pm_ops->prepare) { |
64 | if ((error = pm_ops->prepare(state))) | 85 | if ((error = pm_ops->prepare(state))) |
65 | goto Thaw; | 86 | goto Thaw; |
@@ -75,6 +96,8 @@ static int suspend_prepare(suspend_state_t state) | |||
75 | pm_ops->finish(state); | 96 | pm_ops->finish(state); |
76 | Thaw: | 97 | Thaw: |
77 | thaw_processes(); | 98 | thaw_processes(); |
99 | Enable_cpu: | ||
100 | enable_nonboot_cpus(); | ||
78 | pm_restore_console(); | 101 | pm_restore_console(); |
79 | return error; | 102 | return error; |
80 | } | 103 | } |
@@ -113,6 +136,7 @@ static void suspend_finish(suspend_state_t state) | |||
113 | if (pm_ops && pm_ops->finish) | 136 | if (pm_ops && pm_ops->finish) |
114 | pm_ops->finish(state); | 137 | pm_ops->finish(state); |
115 | thaw_processes(); | 138 | thaw_processes(); |
139 | enable_nonboot_cpus(); | ||
116 | pm_restore_console(); | 140 | pm_restore_console(); |
117 | } | 141 | } |
118 | 142 | ||
@@ -150,12 +174,6 @@ static int enter_state(suspend_state_t state) | |||
150 | goto Unlock; | 174 | goto Unlock; |
151 | } | 175 | } |
152 | 176 | ||
153 | /* Suspend is hard to get right on SMP. */ | ||
154 | if (num_online_cpus() != 1) { | ||
155 | error = -EPERM; | ||
156 | goto Unlock; | ||
157 | } | ||
158 | |||
159 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 177 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
160 | if ((error = suspend_prepare(state))) | 178 | if ((error = suspend_prepare(state))) |
161 | goto Unlock; | 179 | goto Unlock; |
@@ -190,7 +208,7 @@ int software_suspend(void) | |||
190 | 208 | ||
191 | int pm_suspend(suspend_state_t state) | 209 | int pm_suspend(suspend_state_t state) |
192 | { | 210 | { |
193 | if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) | 211 | if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) |
194 | return enter_state(state); | 212 | return enter_state(state); |
195 | return -EINVAL; | 213 | return -EINVAL; |
196 | } | 214 | } |
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 715081b2d829..7a4144ba3afd 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/init.h> | 9 | #include <linux/init.h> |
10 | #include <linux/pm.h> | 10 | #include <linux/pm.h> |
11 | #include <linux/workqueue.h> | 11 | #include <linux/workqueue.h> |
12 | #include <linux/reboot.h> | ||
12 | 13 | ||
13 | /* | 14 | /* |
14 | * When the user hits Sys-Rq o to power down the machine this is the | 15 | * When the user hits Sys-Rq o to power down the machine this is the |
@@ -17,8 +18,7 @@ | |||
17 | 18 | ||
18 | static void do_poweroff(void *dummy) | 19 | static void do_poweroff(void *dummy) |
19 | { | 20 | { |
20 | if (pm_power_off) | 21 | kernel_power_off(); |
21 | pm_power_off(); | ||
22 | } | 22 | } |
23 | 23 | ||
24 | static DECLARE_WORK(poweroff_work, do_poweroff, NULL); | 24 | static DECLARE_WORK(poweroff_work, do_poweroff, NULL); |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 78d92dc6a1ed..3bd0d261818f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p) | |||
32 | } | 32 | } |
33 | 33 | ||
34 | /* Refrigerator is place where frozen processes are stored :-). */ | 34 | /* Refrigerator is place where frozen processes are stored :-). */ |
35 | void refrigerator(unsigned long flag) | 35 | void refrigerator(void) |
36 | { | 36 | { |
37 | /* Hmm, should we be allowed to suspend when there are realtime | 37 | /* Hmm, should we be allowed to suspend when there are realtime |
38 | processes around? */ | 38 | processes around? */ |
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag) | |||
41 | current->state = TASK_UNINTERRUPTIBLE; | 41 | current->state = TASK_UNINTERRUPTIBLE; |
42 | pr_debug("%s entered refrigerator\n", current->comm); | 42 | pr_debug("%s entered refrigerator\n", current->comm); |
43 | printk("="); | 43 | printk("="); |
44 | current->flags &= ~PF_FREEZE; | ||
45 | 44 | ||
45 | frozen_process(current); | ||
46 | spin_lock_irq(&current->sighand->siglock); | 46 | spin_lock_irq(&current->sighand->siglock); |
47 | recalc_sigpending(); /* We sent fake signal, clean it up */ | 47 | recalc_sigpending(); /* We sent fake signal, clean it up */ |
48 | spin_unlock_irq(&current->sighand->siglock); | 48 | spin_unlock_irq(&current->sighand->siglock); |
49 | 49 | ||
50 | current->flags |= PF_FROZEN; | 50 | while (frozen(current)) |
51 | while (current->flags & PF_FROZEN) | ||
52 | schedule(); | 51 | schedule(); |
53 | pr_debug("%s left refrigerator\n", current->comm); | 52 | pr_debug("%s left refrigerator\n", current->comm); |
54 | current->state = save; | 53 | current->state = save; |
@@ -57,27 +56,23 @@ void refrigerator(unsigned long flag) | |||
57 | /* 0 = success, else # of processes that we failed to stop */ | 56 | /* 0 = success, else # of processes that we failed to stop */ |
58 | int freeze_processes(void) | 57 | int freeze_processes(void) |
59 | { | 58 | { |
60 | int todo; | 59 | int todo; |
61 | unsigned long start_time; | 60 | unsigned long start_time; |
62 | struct task_struct *g, *p; | 61 | struct task_struct *g, *p; |
63 | 62 | unsigned long flags; | |
63 | |||
64 | printk( "Stopping tasks: " ); | 64 | printk( "Stopping tasks: " ); |
65 | start_time = jiffies; | 65 | start_time = jiffies; |
66 | do { | 66 | do { |
67 | todo = 0; | 67 | todo = 0; |
68 | read_lock(&tasklist_lock); | 68 | read_lock(&tasklist_lock); |
69 | do_each_thread(g, p) { | 69 | do_each_thread(g, p) { |
70 | unsigned long flags; | ||
71 | if (!freezeable(p)) | 70 | if (!freezeable(p)) |
72 | continue; | 71 | continue; |
73 | if ((p->flags & PF_FROZEN) || | 72 | if (frozen(p)) |
74 | (p->state == TASK_TRACED) || | ||
75 | (p->state == TASK_STOPPED)) | ||
76 | continue; | 73 | continue; |
77 | 74 | ||
78 | /* FIXME: smp problem here: we may not access other process' flags | 75 | freeze(p); |
79 | without locking */ | ||
80 | p->flags |= PF_FREEZE; | ||
81 | spin_lock_irqsave(&p->sighand->siglock, flags); | 76 | spin_lock_irqsave(&p->sighand->siglock, flags); |
82 | signal_wake_up(p, 0); | 77 | signal_wake_up(p, 0); |
83 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 78 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
@@ -91,7 +86,7 @@ int freeze_processes(void) | |||
91 | return todo; | 86 | return todo; |
92 | } | 87 | } |
93 | } while(todo); | 88 | } while(todo); |
94 | 89 | ||
95 | printk( "|\n" ); | 90 | printk( "|\n" ); |
96 | BUG_ON(in_atomic()); | 91 | BUG_ON(in_atomic()); |
97 | return 0; | 92 | return 0; |
@@ -106,10 +101,7 @@ void thaw_processes(void) | |||
106 | do_each_thread(g, p) { | 101 | do_each_thread(g, p) { |
107 | if (!freezeable(p)) | 102 | if (!freezeable(p)) |
108 | continue; | 103 | continue; |
109 | if (p->flags & PF_FROZEN) { | 104 | if (!thaw_process(p)) |
110 | p->flags &= ~PF_FROZEN; | ||
111 | wake_up_process(p); | ||
112 | } else | ||
113 | printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); | 105 | printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); |
114 | } while_each_thread(g, p); | 106 | } while_each_thread(g, p); |
115 | 107 | ||
diff --git a/kernel/power/smp.c b/kernel/power/smp.c index 457c2302ed42..911fc62b8225 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c | |||
@@ -13,73 +13,52 @@ | |||
13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
14 | #include <linux/suspend.h> | 14 | #include <linux/suspend.h> |
15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
16 | #include <linux/cpu.h> | ||
16 | #include <asm/atomic.h> | 17 | #include <asm/atomic.h> |
17 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
18 | 19 | ||
19 | static atomic_t cpu_counter, freeze; | 20 | /* This is protected by pm_sem semaphore */ |
20 | 21 | static cpumask_t frozen_cpus; | |
21 | |||
22 | static void smp_pause(void * data) | ||
23 | { | ||
24 | struct saved_context ctxt; | ||
25 | __save_processor_state(&ctxt); | ||
26 | printk("Sleeping in:\n"); | ||
27 | dump_stack(); | ||
28 | atomic_inc(&cpu_counter); | ||
29 | while (atomic_read(&freeze)) { | ||
30 | /* FIXME: restore takes place at random piece inside this. | ||
31 | This should probably be written in assembly, and | ||
32 | preserve general-purpose registers, too | ||
33 | |||
34 | What about stack? We may need to move to new stack here. | ||
35 | |||
36 | This should better be ran with interrupts disabled. | ||
37 | */ | ||
38 | cpu_relax(); | ||
39 | barrier(); | ||
40 | } | ||
41 | atomic_dec(&cpu_counter); | ||
42 | __restore_processor_state(&ctxt); | ||
43 | } | ||
44 | |||
45 | static cpumask_t oldmask; | ||
46 | 22 | ||
47 | void disable_nonboot_cpus(void) | 23 | void disable_nonboot_cpus(void) |
48 | { | 24 | { |
49 | oldmask = current->cpus_allowed; | 25 | int cpu, error; |
50 | set_cpus_allowed(current, cpumask_of_cpu(0)); | ||
51 | printk("Freezing CPUs (at %d)", raw_smp_processor_id()); | ||
52 | current->state = TASK_INTERRUPTIBLE; | ||
53 | schedule_timeout(HZ); | ||
54 | printk("..."); | ||
55 | BUG_ON(raw_smp_processor_id() != 0); | ||
56 | 26 | ||
57 | /* FIXME: for this to work, all the CPUs must be running | 27 | error = 0; |
58 | * "idle" thread (or we deadlock). Is that guaranteed? */ | 28 | cpus_clear(frozen_cpus); |
59 | 29 | printk("Freezing cpus ...\n"); | |
60 | atomic_set(&cpu_counter, 0); | 30 | for_each_online_cpu(cpu) { |
61 | atomic_set(&freeze, 1); | 31 | if (cpu == 0) |
62 | smp_call_function(smp_pause, NULL, 0, 0); | 32 | continue; |
63 | while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) { | 33 | error = cpu_down(cpu); |
64 | cpu_relax(); | 34 | if (!error) { |
65 | barrier(); | 35 | cpu_set(cpu, frozen_cpus); |
36 | printk("CPU%d is down\n", cpu); | ||
37 | continue; | ||
38 | } | ||
39 | printk("Error taking cpu %d down: %d\n", cpu, error); | ||
66 | } | 40 | } |
67 | printk("ok\n"); | 41 | BUG_ON(raw_smp_processor_id() != 0); |
42 | if (error) | ||
43 | panic("cpus not sleeping"); | ||
68 | } | 44 | } |
69 | 45 | ||
70 | void enable_nonboot_cpus(void) | 46 | void enable_nonboot_cpus(void) |
71 | { | 47 | { |
72 | printk("Restarting CPUs"); | 48 | int cpu, error; |
73 | atomic_set(&freeze, 0); | ||
74 | while (atomic_read(&cpu_counter)) { | ||
75 | cpu_relax(); | ||
76 | barrier(); | ||
77 | } | ||
78 | printk("..."); | ||
79 | set_cpus_allowed(current, oldmask); | ||
80 | schedule(); | ||
81 | printk("ok\n"); | ||
82 | 49 | ||
50 | printk("Thawing cpus ...\n"); | ||
51 | for_each_cpu_mask(cpu, frozen_cpus) { | ||
52 | error = smp_prepare_cpu(cpu); | ||
53 | if (!error) | ||
54 | error = cpu_up(cpu); | ||
55 | if (!error) { | ||
56 | printk("CPU%d is up\n", cpu); | ||
57 | continue; | ||
58 | } | ||
59 | printk("Error taking cpu %d up: %d\n", cpu, error); | ||
60 | panic("Not enough cpus"); | ||
61 | } | ||
62 | cpus_clear(frozen_cpus); | ||
83 | } | 63 | } |
84 | 64 | ||
85 | |||
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 90b3b68dee3f..f2bc71b9fe8b 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -10,12 +10,12 @@ | |||
10 | * This file is released under the GPLv2. | 10 | * This file is released under the GPLv2. |
11 | * | 11 | * |
12 | * I'd like to thank the following people for their work: | 12 | * I'd like to thank the following people for their work: |
13 | * | 13 | * |
14 | * Pavel Machek <pavel@ucw.cz>: | 14 | * Pavel Machek <pavel@ucw.cz>: |
15 | * Modifications, defectiveness pointing, being with me at the very beginning, | 15 | * Modifications, defectiveness pointing, being with me at the very beginning, |
16 | * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. | 16 | * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. |
17 | * | 17 | * |
18 | * Steve Doddi <dirk@loth.demon.co.uk>: | 18 | * Steve Doddi <dirk@loth.demon.co.uk>: |
19 | * Support the possibility of hardware state restoring. | 19 | * Support the possibility of hardware state restoring. |
20 | * | 20 | * |
21 | * Raph <grey.havens@earthling.net>: | 21 | * Raph <grey.havens@earthling.net>: |
@@ -63,6 +63,7 @@ | |||
63 | #include <linux/console.h> | 63 | #include <linux/console.h> |
64 | #include <linux/highmem.h> | 64 | #include <linux/highmem.h> |
65 | #include <linux/bio.h> | 65 | #include <linux/bio.h> |
66 | #include <linux/mount.h> | ||
66 | 67 | ||
67 | #include <asm/uaccess.h> | 68 | #include <asm/uaccess.h> |
68 | #include <asm/mmu_context.h> | 69 | #include <asm/mmu_context.h> |
@@ -81,14 +82,14 @@ static int nr_copy_pages_check; | |||
81 | extern char resume_file[]; | 82 | extern char resume_file[]; |
82 | 83 | ||
83 | /* Local variables that should not be affected by save */ | 84 | /* Local variables that should not be affected by save */ |
84 | unsigned int nr_copy_pages __nosavedata = 0; | 85 | static unsigned int nr_copy_pages __nosavedata = 0; |
85 | 86 | ||
86 | /* Suspend pagedir is allocated before final copy, therefore it | 87 | /* Suspend pagedir is allocated before final copy, therefore it |
87 | must be freed after resume | 88 | must be freed after resume |
88 | 89 | ||
89 | Warning: this is evil. There are actually two pagedirs at time of | 90 | Warning: this is evil. There are actually two pagedirs at time of |
90 | resume. One is "pagedir_save", which is empty frame allocated at | 91 | resume. One is "pagedir_save", which is empty frame allocated at |
91 | time of suspend, that must be freed. Second is "pagedir_nosave", | 92 | time of suspend, that must be freed. Second is "pagedir_nosave", |
92 | allocated at time of resume, that travels through memory not to | 93 | allocated at time of resume, that travels through memory not to |
93 | collide with anything. | 94 | collide with anything. |
94 | 95 | ||
@@ -132,7 +133,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
132 | { | 133 | { |
133 | int error; | 134 | int error; |
134 | 135 | ||
135 | rw_swap_page_sync(READ, | 136 | rw_swap_page_sync(READ, |
136 | swp_entry(root_swap, 0), | 137 | swp_entry(root_swap, 0), |
137 | virt_to_page((unsigned long)&swsusp_header)); | 138 | virt_to_page((unsigned long)&swsusp_header)); |
138 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | 139 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || |
@@ -140,7 +141,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
140 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | 141 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); |
141 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | 142 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); |
142 | swsusp_header.swsusp_info = prev; | 143 | swsusp_header.swsusp_info = prev; |
143 | error = rw_swap_page_sync(WRITE, | 144 | error = rw_swap_page_sync(WRITE, |
144 | swp_entry(root_swap, 0), | 145 | swp_entry(root_swap, 0), |
145 | virt_to_page((unsigned long) | 146 | virt_to_page((unsigned long) |
146 | &swsusp_header)); | 147 | &swsusp_header)); |
@@ -174,22 +175,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info) | |||
174 | static int swsusp_swap_check(void) /* This is called before saving image */ | 175 | static int swsusp_swap_check(void) /* This is called before saving image */ |
175 | { | 176 | { |
176 | int i, len; | 177 | int i, len; |
177 | 178 | ||
178 | len=strlen(resume_file); | 179 | len=strlen(resume_file); |
179 | root_swap = 0xFFFF; | 180 | root_swap = 0xFFFF; |
180 | 181 | ||
181 | swap_list_lock(); | 182 | swap_list_lock(); |
182 | for(i=0; i<MAX_SWAPFILES; i++) { | 183 | for (i=0; i<MAX_SWAPFILES; i++) { |
183 | if (swap_info[i].flags == 0) { | 184 | if (swap_info[i].flags == 0) { |
184 | swapfile_used[i]=SWAPFILE_UNUSED; | 185 | swapfile_used[i]=SWAPFILE_UNUSED; |
185 | } else { | 186 | } else { |
186 | if(!len) { | 187 | if (!len) { |
187 | printk(KERN_WARNING "resume= option should be used to set suspend device" ); | 188 | printk(KERN_WARNING "resume= option should be used to set suspend device" ); |
188 | if(root_swap == 0xFFFF) { | 189 | if (root_swap == 0xFFFF) { |
189 | swapfile_used[i] = SWAPFILE_SUSPEND; | 190 | swapfile_used[i] = SWAPFILE_SUSPEND; |
190 | root_swap = i; | 191 | root_swap = i; |
191 | } else | 192 | } else |
192 | swapfile_used[i] = SWAPFILE_IGNORED; | 193 | swapfile_used[i] = SWAPFILE_IGNORED; |
193 | } else { | 194 | } else { |
194 | /* we ignore all swap devices that are not the resume_file */ | 195 | /* we ignore all swap devices that are not the resume_file */ |
195 | if (is_resume_device(&swap_info[i])) { | 196 | if (is_resume_device(&swap_info[i])) { |
@@ -209,15 +210,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */ | |||
209 | * This is called after saving image so modification | 210 | * This is called after saving image so modification |
210 | * will be lost after resume... and that's what we want. | 211 | * will be lost after resume... and that's what we want. |
211 | * we make the device unusable. A new call to | 212 | * we make the device unusable. A new call to |
212 | * lock_swapdevices can unlock the devices. | 213 | * lock_swapdevices can unlock the devices. |
213 | */ | 214 | */ |
214 | static void lock_swapdevices(void) | 215 | static void lock_swapdevices(void) |
215 | { | 216 | { |
216 | int i; | 217 | int i; |
217 | 218 | ||
218 | swap_list_lock(); | 219 | swap_list_lock(); |
219 | for(i = 0; i< MAX_SWAPFILES; i++) | 220 | for (i = 0; i< MAX_SWAPFILES; i++) |
220 | if(swapfile_used[i] == SWAPFILE_IGNORED) { | 221 | if (swapfile_used[i] == SWAPFILE_IGNORED) { |
221 | swap_info[i].flags ^= 0xFF; | 222 | swap_info[i].flags ^= 0xFF; |
222 | } | 223 | } |
223 | swap_list_unlock(); | 224 | swap_list_unlock(); |
@@ -229,7 +230,7 @@ static void lock_swapdevices(void) | |||
229 | * @loc: Place to store the entry we used. | 230 | * @loc: Place to store the entry we used. |
230 | * | 231 | * |
231 | * Allocate a new swap entry and 'sync' it. Note we discard -EIO | 232 | * Allocate a new swap entry and 'sync' it. Note we discard -EIO |
232 | * errors. That is an artifact left over from swsusp. It did not | 233 | * errors. That is an artifact left over from swsusp. It did not |
233 | * check the return of rw_swap_page_sync() at all, since most pages | 234 | * check the return of rw_swap_page_sync() at all, since most pages |
234 | * written back to swap would return -EIO. | 235 | * written back to swap would return -EIO. |
235 | * This is a partial improvement, since we will at least return other | 236 | * This is a partial improvement, since we will at least return other |
@@ -241,7 +242,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc) | |||
241 | int error = 0; | 242 | int error = 0; |
242 | 243 | ||
243 | entry = get_swap_page(); | 244 | entry = get_swap_page(); |
244 | if (swp_offset(entry) && | 245 | if (swp_offset(entry) && |
245 | swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { | 246 | swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { |
246 | error = rw_swap_page_sync(WRITE, entry, | 247 | error = rw_swap_page_sync(WRITE, entry, |
247 | virt_to_page(addr)); | 248 | virt_to_page(addr)); |
@@ -257,7 +258,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc) | |||
257 | /** | 258 | /** |
258 | * data_free - Free the swap entries used by the saved image. | 259 | * data_free - Free the swap entries used by the saved image. |
259 | * | 260 | * |
260 | * Walk the list of used swap entries and free each one. | 261 | * Walk the list of used swap entries and free each one. |
261 | * This is only used for cleanup when suspend fails. | 262 | * This is only used for cleanup when suspend fails. |
262 | */ | 263 | */ |
263 | static void data_free(void) | 264 | static void data_free(void) |
@@ -290,7 +291,7 @@ static int data_write(void) | |||
290 | mod = 1; | 291 | mod = 1; |
291 | 292 | ||
292 | printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); | 293 | printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); |
293 | for_each_pbe(p, pagedir_nosave) { | 294 | for_each_pbe (p, pagedir_nosave) { |
294 | if (!(i%mod)) | 295 | if (!(i%mod)) |
295 | printk( "\b\b\b\b%3d%%", i / mod ); | 296 | printk( "\b\b\b\b%3d%%", i / mod ); |
296 | if ((error = write_page(p->address, &(p->swap_address)))) | 297 | if ((error = write_page(p->address, &(p->swap_address)))) |
@@ -335,7 +336,7 @@ static int close_swap(void) | |||
335 | 336 | ||
336 | dump_info(); | 337 | dump_info(); |
337 | error = write_page((unsigned long)&swsusp_info, &entry); | 338 | error = write_page((unsigned long)&swsusp_info, &entry); |
338 | if (!error) { | 339 | if (!error) { |
339 | printk( "S" ); | 340 | printk( "S" ); |
340 | error = mark_swapfiles(entry); | 341 | error = mark_swapfiles(entry); |
341 | printk( "|\n" ); | 342 | printk( "|\n" ); |
@@ -370,7 +371,7 @@ static int write_pagedir(void) | |||
370 | struct pbe * pbe; | 371 | struct pbe * pbe; |
371 | 372 | ||
372 | printk( "Writing pagedir..."); | 373 | printk( "Writing pagedir..."); |
373 | for_each_pb_page(pbe, pagedir_nosave) { | 374 | for_each_pb_page (pbe, pagedir_nosave) { |
374 | if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) | 375 | if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) |
375 | return error; | 376 | return error; |
376 | } | 377 | } |
@@ -472,7 +473,7 @@ static int save_highmem(void) | |||
472 | int res = 0; | 473 | int res = 0; |
473 | 474 | ||
474 | pr_debug("swsusp: Saving Highmem\n"); | 475 | pr_debug("swsusp: Saving Highmem\n"); |
475 | for_each_zone(zone) { | 476 | for_each_zone (zone) { |
476 | if (is_highmem(zone)) | 477 | if (is_highmem(zone)) |
477 | res = save_highmem_zone(zone); | 478 | res = save_highmem_zone(zone); |
478 | if (res) | 479 | if (res) |
@@ -547,7 +548,7 @@ static void count_data_pages(void) | |||
547 | 548 | ||
548 | nr_copy_pages = 0; | 549 | nr_copy_pages = 0; |
549 | 550 | ||
550 | for_each_zone(zone) { | 551 | for_each_zone (zone) { |
551 | if (is_highmem(zone)) | 552 | if (is_highmem(zone)) |
552 | continue; | 553 | continue; |
553 | mark_free_pages(zone); | 554 | mark_free_pages(zone); |
@@ -562,9 +563,9 @@ static void copy_data_pages(void) | |||
562 | struct zone *zone; | 563 | struct zone *zone; |
563 | unsigned long zone_pfn; | 564 | unsigned long zone_pfn; |
564 | struct pbe * pbe = pagedir_nosave; | 565 | struct pbe * pbe = pagedir_nosave; |
565 | 566 | ||
566 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); | 567 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); |
567 | for_each_zone(zone) { | 568 | for_each_zone (zone) { |
568 | if (is_highmem(zone)) | 569 | if (is_highmem(zone)) |
569 | continue; | 570 | continue; |
570 | mark_free_pages(zone); | 571 | mark_free_pages(zone); |
@@ -702,7 +703,7 @@ static void free_image_pages(void) | |||
702 | { | 703 | { |
703 | struct pbe * p; | 704 | struct pbe * p; |
704 | 705 | ||
705 | for_each_pbe(p, pagedir_save) { | 706 | for_each_pbe (p, pagedir_save) { |
706 | if (p->address) { | 707 | if (p->address) { |
707 | ClearPageNosave(virt_to_page(p->address)); | 708 | ClearPageNosave(virt_to_page(p->address)); |
708 | free_page(p->address); | 709 | free_page(p->address); |
@@ -719,7 +720,7 @@ static int alloc_image_pages(void) | |||
719 | { | 720 | { |
720 | struct pbe * p; | 721 | struct pbe * p; |
721 | 722 | ||
722 | for_each_pbe(p, pagedir_save) { | 723 | for_each_pbe (p, pagedir_save) { |
723 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | 724 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); |
724 | if (!p->address) | 725 | if (!p->address) |
725 | return -ENOMEM; | 726 | return -ENOMEM; |
@@ -740,7 +741,7 @@ void swsusp_free(void) | |||
740 | /** | 741 | /** |
741 | * enough_free_mem - Make sure we have enough free memory to snapshot. | 742 | * enough_free_mem - Make sure we have enough free memory to snapshot. |
742 | * | 743 | * |
743 | * Returns TRUE or FALSE after checking the number of available | 744 | * Returns TRUE or FALSE after checking the number of available |
744 | * free pages. | 745 | * free pages. |
745 | */ | 746 | */ |
746 | 747 | ||
@@ -758,11 +759,11 @@ static int enough_free_mem(void) | |||
758 | /** | 759 | /** |
759 | * enough_swap - Make sure we have enough swap to save the image. | 760 | * enough_swap - Make sure we have enough swap to save the image. |
760 | * | 761 | * |
761 | * Returns TRUE or FALSE after checking the total amount of swap | 762 | * Returns TRUE or FALSE after checking the total amount of swap |
762 | * space available. | 763 | * space available. |
763 | * | 764 | * |
764 | * FIXME: si_swapinfo(&i) returns all swap devices information. | 765 | * FIXME: si_swapinfo(&i) returns all swap devices information. |
765 | * We should only consider resume_device. | 766 | * We should only consider resume_device. |
766 | */ | 767 | */ |
767 | 768 | ||
768 | static int enough_swap(void) | 769 | static int enough_swap(void) |
@@ -781,18 +782,18 @@ static int swsusp_alloc(void) | |||
781 | { | 782 | { |
782 | int error; | 783 | int error; |
783 | 784 | ||
785 | pagedir_nosave = NULL; | ||
786 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
787 | |||
784 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", | 788 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", |
785 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); | 789 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); |
786 | 790 | ||
787 | pagedir_nosave = NULL; | ||
788 | if (!enough_free_mem()) | 791 | if (!enough_free_mem()) |
789 | return -ENOMEM; | 792 | return -ENOMEM; |
790 | 793 | ||
791 | if (!enough_swap()) | 794 | if (!enough_swap()) |
792 | return -ENOSPC; | 795 | return -ENOSPC; |
793 | 796 | ||
794 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
795 | |||
796 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { | 797 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { |
797 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | 798 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); |
798 | return -ENOMEM; | 799 | return -ENOMEM; |
@@ -827,8 +828,8 @@ static int suspend_prepare_image(void) | |||
827 | error = swsusp_alloc(); | 828 | error = swsusp_alloc(); |
828 | if (error) | 829 | if (error) |
829 | return error; | 830 | return error; |
830 | 831 | ||
831 | /* While the suspend pagedir is being allocated, new cold pages may appear. | 832 | /* While the suspend pagedir is being allocated, new cold pages may appear. |
832 | * Kill them. | 833 | * Kill them. |
833 | */ | 834 | */ |
834 | drain_local_pages(); | 835 | drain_local_pages(); |
@@ -869,13 +870,6 @@ extern asmlinkage int swsusp_arch_resume(void); | |||
869 | 870 | ||
870 | asmlinkage int swsusp_save(void) | 871 | asmlinkage int swsusp_save(void) |
871 | { | 872 | { |
872 | int error = 0; | ||
873 | |||
874 | if ((error = swsusp_swap_check())) { | ||
875 | printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " | ||
876 | "swapon -a!\n"); | ||
877 | return error; | ||
878 | } | ||
879 | return suspend_prepare_image(); | 873 | return suspend_prepare_image(); |
880 | } | 874 | } |
881 | 875 | ||
@@ -892,14 +886,20 @@ int swsusp_suspend(void) | |||
892 | * at resume time, and evil weirdness ensues. | 886 | * at resume time, and evil weirdness ensues. |
893 | */ | 887 | */ |
894 | if ((error = device_power_down(PMSG_FREEZE))) { | 888 | if ((error = device_power_down(PMSG_FREEZE))) { |
895 | printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); | ||
896 | local_irq_enable(); | 889 | local_irq_enable(); |
897 | swsusp_free(); | ||
898 | return error; | 890 | return error; |
899 | } | 891 | } |
892 | |||
893 | if ((error = swsusp_swap_check())) { | ||
894 | printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " | ||
895 | "swapon -a!\n"); | ||
896 | local_irq_enable(); | ||
897 | return error; | ||
898 | } | ||
899 | |||
900 | save_processor_state(); | 900 | save_processor_state(); |
901 | if ((error = swsusp_arch_suspend())) | 901 | if ((error = swsusp_arch_suspend())) |
902 | swsusp_free(); | 902 | printk("Error %d suspending\n", error); |
903 | /* Restore control flow magically appears here */ | 903 | /* Restore control flow magically appears here */ |
904 | restore_processor_state(); | 904 | restore_processor_state(); |
905 | BUG_ON (nr_copy_pages_check != nr_copy_pages); | 905 | BUG_ON (nr_copy_pages_check != nr_copy_pages); |
@@ -929,21 +929,6 @@ int swsusp_resume(void) | |||
929 | return error; | 929 | return error; |
930 | } | 930 | } |
931 | 931 | ||
932 | /* More restore stuff */ | ||
933 | |||
934 | /* | ||
935 | * Returns true if given address/order collides with any orig_address | ||
936 | */ | ||
937 | static int does_collide_order(unsigned long addr, int order) | ||
938 | { | ||
939 | int i; | ||
940 | |||
941 | for (i=0; i < (1<<order); i++) | ||
942 | if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE))) | ||
943 | return 1; | ||
944 | return 0; | ||
945 | } | ||
946 | |||
947 | /** | 932 | /** |
948 | * On resume, for storing the PBE list and the image, | 933 | * On resume, for storing the PBE list and the image, |
949 | * we can only use memory pages that do not conflict with the pages | 934 | * we can only use memory pages that do not conflict with the pages |
@@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask) | |||
973 | unsigned long m; | 958 | unsigned long m; |
974 | 959 | ||
975 | m = get_zeroed_page(gfp_mask); | 960 | m = get_zeroed_page(gfp_mask); |
976 | while (does_collide_order(m, 0)) { | 961 | while (!PageNosaveFree(virt_to_page(m))) { |
977 | eat_page((void *)m); | 962 | eat_page((void *)m); |
978 | m = get_zeroed_page(gfp_mask); | 963 | m = get_zeroed_page(gfp_mask); |
979 | if (!m) | 964 | if (!m) |
@@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
1045 | 1030 | ||
1046 | /* Set page flags */ | 1031 | /* Set page flags */ |
1047 | 1032 | ||
1048 | for_each_zone(zone) { | 1033 | for_each_zone (zone) { |
1049 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 1034 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) |
1050 | SetPageNosaveFree(pfn_to_page(zone_pfn + | 1035 | SetPageNosaveFree(pfn_to_page(zone_pfn + |
1051 | zone->zone_start_pfn)); | 1036 | zone->zone_start_pfn)); |
@@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
1061 | /* Relocate colliding pages */ | 1046 | /* Relocate colliding pages */ |
1062 | 1047 | ||
1063 | for_each_pb_page (pbpage, pblist) { | 1048 | for_each_pb_page (pbpage, pblist) { |
1064 | if (does_collide_order((unsigned long)pbpage, 0)) { | 1049 | if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { |
1065 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); | 1050 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); |
1066 | if (!m) { | 1051 | if (!m) { |
1067 | error = -ENOMEM; | 1052 | error = -ENOMEM; |
@@ -1181,9 +1166,9 @@ static int bio_write_page(pgoff_t page_off, void * page) | |||
1181 | static const char * sanity_check(void) | 1166 | static const char * sanity_check(void) |
1182 | { | 1167 | { |
1183 | dump_info(); | 1168 | dump_info(); |
1184 | if(swsusp_info.version_code != LINUX_VERSION_CODE) | 1169 | if (swsusp_info.version_code != LINUX_VERSION_CODE) |
1185 | return "kernel version"; | 1170 | return "kernel version"; |
1186 | if(swsusp_info.num_physpages != num_physpages) | 1171 | if (swsusp_info.num_physpages != num_physpages) |
1187 | return "memory size"; | 1172 | return "memory size"; |
1188 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) | 1173 | if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) |
1189 | return "system type"; | 1174 | return "system type"; |
@@ -1193,8 +1178,10 @@ static const char * sanity_check(void) | |||
1193 | return "version"; | 1178 | return "version"; |
1194 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | 1179 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) |
1195 | return "machine"; | 1180 | return "machine"; |
1181 | #if 0 | ||
1196 | if(swsusp_info.cpus != num_online_cpus()) | 1182 | if(swsusp_info.cpus != num_online_cpus()) |
1197 | return "number of cpus"; | 1183 | return "number of cpus"; |
1184 | #endif | ||
1198 | return NULL; | 1185 | return NULL; |
1199 | } | 1186 | } |
1200 | 1187 | ||
@@ -1274,8 +1261,6 @@ static int data_read(struct pbe *pblist) | |||
1274 | return error; | 1261 | return error; |
1275 | } | 1262 | } |
1276 | 1263 | ||
1277 | extern dev_t name_to_dev_t(const char *line); | ||
1278 | |||
1279 | /** | 1264 | /** |
1280 | * read_pagedir - Read page backup list pages from swap | 1265 | * read_pagedir - Read page backup list pages from swap |
1281 | */ | 1266 | */ |
@@ -1369,16 +1354,6 @@ int swsusp_check(void) | |||
1369 | { | 1354 | { |
1370 | int error; | 1355 | int error; |
1371 | 1356 | ||
1372 | if (!swsusp_resume_device) { | ||
1373 | if (!strlen(resume_file)) | ||
1374 | return -ENOENT; | ||
1375 | swsusp_resume_device = name_to_dev_t(resume_file); | ||
1376 | pr_debug("swsusp: Resume From Partition %s\n", resume_file); | ||
1377 | } else { | ||
1378 | pr_debug("swsusp: Resume From Partition %d:%d\n", | ||
1379 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); | ||
1380 | } | ||
1381 | |||
1382 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); | 1357 | resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); |
1383 | if (!IS_ERR(resume_bdev)) { | 1358 | if (!IS_ERR(resume_bdev)) { |
1384 | set_blocksize(resume_bdev, PAGE_SIZE); | 1359 | set_blocksize(resume_bdev, PAGE_SIZE); |
diff --git a/kernel/printk.c b/kernel/printk.c index 01b58d7d17ff..5092397fac29 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -588,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
588 | log_level_unknown = 1; | 588 | log_level_unknown = 1; |
589 | } | 589 | } |
590 | 590 | ||
591 | if (!cpu_online(smp_processor_id()) && | 591 | if (!cpu_online(smp_processor_id())) { |
592 | system_state != SYSTEM_RUNNING) { | ||
593 | /* | 592 | /* |
594 | * Some console drivers may assume that per-cpu resources have | 593 | * Some console drivers may assume that per-cpu resources have |
595 | * been allocated. So don't allow them to be called by this | 594 | * been allocated. So don't allow them to be called by this |
@@ -876,8 +875,10 @@ void register_console(struct console * console) | |||
876 | break; | 875 | break; |
877 | console->flags |= CON_ENABLED; | 876 | console->flags |= CON_ENABLED; |
878 | console->index = console_cmdline[i].index; | 877 | console->index = console_cmdline[i].index; |
879 | if (i == preferred_console) | 878 | if (i == selected_console) { |
880 | console->flags |= CON_CONSDEV; | 879 | console->flags |= CON_CONSDEV; |
880 | preferred_console = selected_console; | ||
881 | } | ||
881 | break; | 882 | break; |
882 | } | 883 | } |
883 | 884 | ||
@@ -897,6 +898,8 @@ void register_console(struct console * console) | |||
897 | if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { | 898 | if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { |
898 | console->next = console_drivers; | 899 | console->next = console_drivers; |
899 | console_drivers = console; | 900 | console_drivers = console; |
901 | if (console->next) | ||
902 | console->next->flags &= ~CON_CONSDEV; | ||
900 | } else { | 903 | } else { |
901 | console->next = console_drivers->next; | 904 | console->next = console_drivers->next; |
902 | console_drivers->next = console; | 905 | console_drivers->next = console; |
@@ -937,10 +940,14 @@ int unregister_console(struct console * console) | |||
937 | /* If last console is removed, we re-enable picking the first | 940 | /* If last console is removed, we re-enable picking the first |
938 | * one that gets registered. Without that, pmac early boot console | 941 | * one that gets registered. Without that, pmac early boot console |
939 | * would prevent fbcon from taking over. | 942 | * would prevent fbcon from taking over. |
943 | * | ||
944 | * If this isn't the last console and it has CON_CONSDEV set, we | ||
945 | * need to set it on the next preferred console. | ||
940 | */ | 946 | */ |
941 | if (console_drivers == NULL) | 947 | if (console_drivers == NULL) |
942 | preferred_console = selected_console; | 948 | preferred_console = selected_console; |
943 | 949 | else if (console->flags & CON_CONSDEV) | |
950 | console_drivers->flags |= CON_CONSDEV; | ||
944 | 951 | ||
945 | release_console_sem(); | 952 | release_console_sem(); |
946 | return res; | 953 | return res; |
diff --git a/kernel/profile.c b/kernel/profile.c index ad8cbb75ffa2..f89248e6d704 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -35,11 +35,11 @@ struct profile_hit { | |||
35 | #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) | 35 | #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) |
36 | 36 | ||
37 | /* Oprofile timer tick hook */ | 37 | /* Oprofile timer tick hook */ |
38 | int (*timer_hook)(struct pt_regs *); | 38 | int (*timer_hook)(struct pt_regs *) __read_mostly; |
39 | 39 | ||
40 | static atomic_t *prof_buffer; | 40 | static atomic_t *prof_buffer; |
41 | static unsigned long prof_len, prof_shift; | 41 | static unsigned long prof_len, prof_shift; |
42 | static int prof_on; | 42 | static int prof_on __read_mostly; |
43 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; | 43 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; |
44 | #ifdef CONFIG_SMP | 44 | #ifdef CONFIG_SMP |
45 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); | 45 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); |
diff --git a/kernel/resource.c b/kernel/resource.c index 52f696f11adf..26967e042201 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new, | |||
263 | new->start = min; | 263 | new->start = min; |
264 | if (new->end > max) | 264 | if (new->end > max) |
265 | new->end = max; | 265 | new->end = max; |
266 | new->start = (new->start + align - 1) & ~(align - 1); | 266 | new->start = ALIGN(new->start, align); |
267 | if (alignf) | 267 | if (alignf) |
268 | alignf(alignf_data, new, size, align); | 268 | alignf(alignf_data, new, size, align); |
269 | if (new->start < new->end && new->end - new->start >= size - 1) { | 269 | if (new->start < new->end && new->end - new->start >= size - 1) { |
diff --git a/kernel/sched.c b/kernel/sched.c index deca041fc364..5f889d0cbfcc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -166,7 +166,7 @@ | |||
166 | #define SCALE_PRIO(x, prio) \ | 166 | #define SCALE_PRIO(x, prio) \ |
167 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 167 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) |
168 | 168 | ||
169 | static inline unsigned int task_timeslice(task_t *p) | 169 | static unsigned int task_timeslice(task_t *p) |
170 | { | 170 | { |
171 | if (p->static_prio < NICE_TO_PRIO(0)) | 171 | if (p->static_prio < NICE_TO_PRIO(0)) |
172 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 172 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); |
@@ -206,7 +206,7 @@ struct runqueue { | |||
206 | */ | 206 | */ |
207 | unsigned long nr_running; | 207 | unsigned long nr_running; |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
209 | unsigned long cpu_load; | 209 | unsigned long cpu_load[3]; |
210 | #endif | 210 | #endif |
211 | unsigned long long nr_switches; | 211 | unsigned long long nr_switches; |
212 | 212 | ||
@@ -260,22 +260,86 @@ struct runqueue { | |||
260 | 260 | ||
261 | static DEFINE_PER_CPU(struct runqueue, runqueues); | 261 | static DEFINE_PER_CPU(struct runqueue, runqueues); |
262 | 262 | ||
263 | /* | ||
264 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | ||
265 | * See detach_destroy_domains: synchronize_sched for details. | ||
266 | * | ||
267 | * The domain tree of any CPU may only be accessed from within | ||
268 | * preempt-disabled sections. | ||
269 | */ | ||
263 | #define for_each_domain(cpu, domain) \ | 270 | #define for_each_domain(cpu, domain) \ |
264 | for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) | 271 | for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) |
265 | 272 | ||
266 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 273 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
267 | #define this_rq() (&__get_cpu_var(runqueues)) | 274 | #define this_rq() (&__get_cpu_var(runqueues)) |
268 | #define task_rq(p) cpu_rq(task_cpu(p)) | 275 | #define task_rq(p) cpu_rq(task_cpu(p)) |
269 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 276 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
270 | 277 | ||
271 | /* | ||
272 | * Default context-switch locking: | ||
273 | */ | ||
274 | #ifndef prepare_arch_switch | 278 | #ifndef prepare_arch_switch |
275 | # define prepare_arch_switch(rq, next) do { } while (0) | 279 | # define prepare_arch_switch(next) do { } while (0) |
276 | # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) | 280 | #endif |
277 | # define task_running(rq, p) ((rq)->curr == (p)) | 281 | #ifndef finish_arch_switch |
282 | # define finish_arch_switch(prev) do { } while (0) | ||
283 | #endif | ||
284 | |||
285 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | ||
286 | static inline int task_running(runqueue_t *rq, task_t *p) | ||
287 | { | ||
288 | return rq->curr == p; | ||
289 | } | ||
290 | |||
291 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | ||
292 | { | ||
293 | } | ||
294 | |||
295 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | ||
296 | { | ||
297 | spin_unlock_irq(&rq->lock); | ||
298 | } | ||
299 | |||
300 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
301 | static inline int task_running(runqueue_t *rq, task_t *p) | ||
302 | { | ||
303 | #ifdef CONFIG_SMP | ||
304 | return p->oncpu; | ||
305 | #else | ||
306 | return rq->curr == p; | ||
307 | #endif | ||
308 | } | ||
309 | |||
310 | static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) | ||
311 | { | ||
312 | #ifdef CONFIG_SMP | ||
313 | /* | ||
314 | * We can optimise this out completely for !SMP, because the | ||
315 | * SMP rebalancing from interrupt is the only thing that cares | ||
316 | * here. | ||
317 | */ | ||
318 | next->oncpu = 1; | ||
278 | #endif | 319 | #endif |
320 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
321 | spin_unlock_irq(&rq->lock); | ||
322 | #else | ||
323 | spin_unlock(&rq->lock); | ||
324 | #endif | ||
325 | } | ||
326 | |||
327 | static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | ||
328 | { | ||
329 | #ifdef CONFIG_SMP | ||
330 | /* | ||
331 | * After ->oncpu is cleared, the task can be moved to a different CPU. | ||
332 | * We must ensure this doesn't happen until the switch is completely | ||
333 | * finished. | ||
334 | */ | ||
335 | smp_wmb(); | ||
336 | prev->oncpu = 0; | ||
337 | #endif | ||
338 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
339 | local_irq_enable(); | ||
340 | #endif | ||
341 | } | ||
342 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | ||
279 | 343 | ||
280 | /* | 344 | /* |
281 | * task_rq_lock - lock the runqueue a given task resides on and disable | 345 | * task_rq_lock - lock the runqueue a given task resides on and disable |
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | |||
309 | * bump this up when changing the output format or the meaning of an existing | 373 | * bump this up when changing the output format or the meaning of an existing |
310 | * format, so that tools can adapt (or abort) | 374 | * format, so that tools can adapt (or abort) |
311 | */ | 375 | */ |
312 | #define SCHEDSTAT_VERSION 11 | 376 | #define SCHEDSTAT_VERSION 12 |
313 | 377 | ||
314 | static int show_schedstat(struct seq_file *seq, void *v) | 378 | static int show_schedstat(struct seq_file *seq, void *v) |
315 | { | 379 | { |
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
338 | 402 | ||
339 | #ifdef CONFIG_SMP | 403 | #ifdef CONFIG_SMP |
340 | /* domain-specific stats */ | 404 | /* domain-specific stats */ |
405 | preempt_disable(); | ||
341 | for_each_domain(cpu, sd) { | 406 | for_each_domain(cpu, sd) { |
342 | enum idle_type itype; | 407 | enum idle_type itype; |
343 | char mask_str[NR_CPUS]; | 408 | char mask_str[NR_CPUS]; |
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
356 | sd->lb_nobusyq[itype], | 421 | sd->lb_nobusyq[itype], |
357 | sd->lb_nobusyg[itype]); | 422 | sd->lb_nobusyg[itype]); |
358 | } | 423 | } |
359 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n", | 424 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", |
360 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 425 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
361 | sd->sbe_pushed, sd->sbe_attempts, | 426 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
427 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | ||
362 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | 428 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); |
363 | } | 429 | } |
430 | preempt_enable(); | ||
364 | #endif | 431 | #endif |
365 | } | 432 | } |
366 | return 0; | 433 | return 0; |
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void) | |||
414 | return rq; | 481 | return rq; |
415 | } | 482 | } |
416 | 483 | ||
417 | #ifdef CONFIG_SCHED_SMT | ||
418 | static int cpu_and_siblings_are_idle(int cpu) | ||
419 | { | ||
420 | int sib; | ||
421 | for_each_cpu_mask(sib, cpu_sibling_map[cpu]) { | ||
422 | if (idle_cpu(sib)) | ||
423 | continue; | ||
424 | return 0; | ||
425 | } | ||
426 | |||
427 | return 1; | ||
428 | } | ||
429 | #else | ||
430 | #define cpu_and_siblings_are_idle(A) idle_cpu(A) | ||
431 | #endif | ||
432 | |||
433 | #ifdef CONFIG_SCHEDSTATS | 484 | #ifdef CONFIG_SCHEDSTATS |
434 | /* | 485 | /* |
435 | * Called when a process is dequeued from the active array and given | 486 | * Called when a process is dequeued from the active array and given |
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | |||
622 | rq->nr_running++; | 673 | rq->nr_running++; |
623 | } | 674 | } |
624 | 675 | ||
625 | static void recalc_task_prio(task_t *p, unsigned long long now) | 676 | static int recalc_task_prio(task_t *p, unsigned long long now) |
626 | { | 677 | { |
627 | /* Caller must always ensure 'now >= p->timestamp' */ | 678 | /* Caller must always ensure 'now >= p->timestamp' */ |
628 | unsigned long long __sleep_time = now - p->timestamp; | 679 | unsigned long long __sleep_time = now - p->timestamp; |
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) | |||
681 | } | 732 | } |
682 | } | 733 | } |
683 | 734 | ||
684 | p->prio = effective_prio(p); | 735 | return effective_prio(p); |
685 | } | 736 | } |
686 | 737 | ||
687 | /* | 738 | /* |
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
704 | } | 755 | } |
705 | #endif | 756 | #endif |
706 | 757 | ||
707 | recalc_task_prio(p, now); | 758 | p->prio = recalc_task_prio(p, now); |
708 | 759 | ||
709 | /* | 760 | /* |
710 | * This checks to make sure it's not an uninterruptible task | 761 | * This checks to make sure it's not an uninterruptible task |
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p) | |||
782 | } | 833 | } |
783 | 834 | ||
784 | #ifdef CONFIG_SMP | 835 | #ifdef CONFIG_SMP |
785 | enum request_type { | ||
786 | REQ_MOVE_TASK, | ||
787 | REQ_SET_DOMAIN, | ||
788 | }; | ||
789 | |||
790 | typedef struct { | 836 | typedef struct { |
791 | struct list_head list; | 837 | struct list_head list; |
792 | enum request_type type; | ||
793 | 838 | ||
794 | /* For REQ_MOVE_TASK */ | ||
795 | task_t *task; | 839 | task_t *task; |
796 | int dest_cpu; | 840 | int dest_cpu; |
797 | 841 | ||
798 | /* For REQ_SET_DOMAIN */ | ||
799 | struct sched_domain *sd; | ||
800 | |||
801 | struct completion done; | 842 | struct completion done; |
802 | } migration_req_t; | 843 | } migration_req_t; |
803 | 844 | ||
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
819 | } | 860 | } |
820 | 861 | ||
821 | init_completion(&req->done); | 862 | init_completion(&req->done); |
822 | req->type = REQ_MOVE_TASK; | ||
823 | req->task = p; | 863 | req->task = p; |
824 | req->dest_cpu = dest_cpu; | 864 | req->dest_cpu = dest_cpu; |
825 | list_add(&req->list, &rq->migration_queue); | 865 | list_add(&req->list, &rq->migration_queue); |
@@ -886,26 +926,154 @@ void kick_process(task_t *p) | |||
886 | * We want to under-estimate the load of migration sources, to | 926 | * We want to under-estimate the load of migration sources, to |
887 | * balance conservatively. | 927 | * balance conservatively. |
888 | */ | 928 | */ |
889 | static inline unsigned long source_load(int cpu) | 929 | static inline unsigned long source_load(int cpu, int type) |
890 | { | 930 | { |
891 | runqueue_t *rq = cpu_rq(cpu); | 931 | runqueue_t *rq = cpu_rq(cpu); |
892 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 932 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
933 | if (type == 0) | ||
934 | return load_now; | ||
893 | 935 | ||
894 | return min(rq->cpu_load, load_now); | 936 | return min(rq->cpu_load[type-1], load_now); |
895 | } | 937 | } |
896 | 938 | ||
897 | /* | 939 | /* |
898 | * Return a high guess at the load of a migration-target cpu | 940 | * Return a high guess at the load of a migration-target cpu |
899 | */ | 941 | */ |
900 | static inline unsigned long target_load(int cpu) | 942 | static inline unsigned long target_load(int cpu, int type) |
901 | { | 943 | { |
902 | runqueue_t *rq = cpu_rq(cpu); | 944 | runqueue_t *rq = cpu_rq(cpu); |
903 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 945 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; |
946 | if (type == 0) | ||
947 | return load_now; | ||
904 | 948 | ||
905 | return max(rq->cpu_load, load_now); | 949 | return max(rq->cpu_load[type-1], load_now); |
906 | } | 950 | } |
907 | 951 | ||
908 | #endif | 952 | /* |
953 | * find_idlest_group finds and returns the least busy CPU group within the | ||
954 | * domain. | ||
955 | */ | ||
956 | static struct sched_group * | ||
957 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||
958 | { | ||
959 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | ||
960 | unsigned long min_load = ULONG_MAX, this_load = 0; | ||
961 | int load_idx = sd->forkexec_idx; | ||
962 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | ||
963 | |||
964 | do { | ||
965 | unsigned long load, avg_load; | ||
966 | int local_group; | ||
967 | int i; | ||
968 | |||
969 | local_group = cpu_isset(this_cpu, group->cpumask); | ||
970 | /* XXX: put a cpus allowed check */ | ||
971 | |||
972 | /* Tally up the load of all CPUs in the group */ | ||
973 | avg_load = 0; | ||
974 | |||
975 | for_each_cpu_mask(i, group->cpumask) { | ||
976 | /* Bias balancing toward cpus of our domain */ | ||
977 | if (local_group) | ||
978 | load = source_load(i, load_idx); | ||
979 | else | ||
980 | load = target_load(i, load_idx); | ||
981 | |||
982 | avg_load += load; | ||
983 | } | ||
984 | |||
985 | /* Adjust by relative CPU power of the group */ | ||
986 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | ||
987 | |||
988 | if (local_group) { | ||
989 | this_load = avg_load; | ||
990 | this = group; | ||
991 | } else if (avg_load < min_load) { | ||
992 | min_load = avg_load; | ||
993 | idlest = group; | ||
994 | } | ||
995 | group = group->next; | ||
996 | } while (group != sd->groups); | ||
997 | |||
998 | if (!idlest || 100*this_load < imbalance*min_load) | ||
999 | return NULL; | ||
1000 | return idlest; | ||
1001 | } | ||
1002 | |||
1003 | /* | ||
1004 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | |
1005 | */ | ||
1006 | static int find_idlest_cpu(struct sched_group *group, int this_cpu) | ||
1007 | { | ||
1008 | unsigned long load, min_load = ULONG_MAX; | ||
1009 | int idlest = -1; | ||
1010 | int i; | ||
1011 | |||
1012 | for_each_cpu_mask(i, group->cpumask) { | ||
1013 | load = source_load(i, 0); | ||
1014 | |||
1015 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
1016 | min_load = load; | ||
1017 | idlest = i; | ||
1018 | } | ||
1019 | } | ||
1020 | |||
1021 | return idlest; | ||
1022 | } | ||
1023 | |||
1024 | /* | ||
1025 | * sched_balance_self: balance the current task (running on cpu) in domains | ||
1026 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | ||
1027 | * SD_BALANCE_EXEC. | ||
1028 | * | ||
1029 | * Balance, ie. select the least loaded group. | ||
1030 | * | ||
1031 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
1032 | * | ||
1033 | * preempt must be disabled. | ||
1034 | */ | ||
1035 | static int sched_balance_self(int cpu, int flag) | ||
1036 | { | ||
1037 | struct task_struct *t = current; | ||
1038 | struct sched_domain *tmp, *sd = NULL; | ||
1039 | |||
1040 | for_each_domain(cpu, tmp) | ||
1041 | if (tmp->flags & flag) | ||
1042 | sd = tmp; | ||
1043 | |||
1044 | while (sd) { | ||
1045 | cpumask_t span; | ||
1046 | struct sched_group *group; | ||
1047 | int new_cpu; | ||
1048 | int weight; | ||
1049 | |||
1050 | span = sd->span; | ||
1051 | group = find_idlest_group(sd, t, cpu); | ||
1052 | if (!group) | ||
1053 | goto nextlevel; | ||
1054 | |||
1055 | new_cpu = find_idlest_cpu(group, cpu); | ||
1056 | if (new_cpu == -1 || new_cpu == cpu) | ||
1057 | goto nextlevel; | ||
1058 | |||
1059 | /* Now try balancing at a lower domain level */ | ||
1060 | cpu = new_cpu; | ||
1061 | nextlevel: | ||
1062 | sd = NULL; | ||
1063 | weight = cpus_weight(span); | ||
1064 | for_each_domain(cpu, tmp) { | ||
1065 | if (weight <= cpus_weight(tmp->span)) | ||
1066 | break; | ||
1067 | if (tmp->flags & flag) | ||
1068 | sd = tmp; | ||
1069 | } | ||
1070 | /* while loop will break here if sd == NULL */ | ||
1071 | } | ||
1072 | |||
1073 | return cpu; | ||
1074 | } | ||
1075 | |||
1076 | #endif /* CONFIG_SMP */ | ||
909 | 1077 | ||
910 | /* | 1078 | /* |
911 | * wake_idle() will wake a task on an idle cpu if task->cpu is | 1079 | * wake_idle() will wake a task on an idle cpu if task->cpu is |
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p) | |||
927 | 1095 | ||
928 | for_each_domain(cpu, sd) { | 1096 | for_each_domain(cpu, sd) { |
929 | if (sd->flags & SD_WAKE_IDLE) { | 1097 | if (sd->flags & SD_WAKE_IDLE) { |
930 | cpus_and(tmp, sd->span, cpu_online_map); | 1098 | cpus_and(tmp, sd->span, p->cpus_allowed); |
931 | cpus_and(tmp, tmp, p->cpus_allowed); | ||
932 | for_each_cpu_mask(i, tmp) { | 1099 | for_each_cpu_mask(i, tmp) { |
933 | if (idle_cpu(i)) | 1100 | if (idle_cpu(i)) |
934 | return i; | 1101 | return i; |
935 | } | 1102 | } |
936 | } | 1103 | } |
937 | else break; | 1104 | else |
1105 | break; | ||
938 | } | 1106 | } |
939 | return cpu; | 1107 | return cpu; |
940 | } | 1108 | } |
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) | |||
967 | runqueue_t *rq; | 1135 | runqueue_t *rq; |
968 | #ifdef CONFIG_SMP | 1136 | #ifdef CONFIG_SMP |
969 | unsigned long load, this_load; | 1137 | unsigned long load, this_load; |
970 | struct sched_domain *sd; | 1138 | struct sched_domain *sd, *this_sd = NULL; |
971 | int new_cpu; | 1139 | int new_cpu; |
972 | #endif | 1140 | #endif |
973 | 1141 | ||
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) | |||
986 | if (unlikely(task_running(rq, p))) | 1154 | if (unlikely(task_running(rq, p))) |
987 | goto out_activate; | 1155 | goto out_activate; |
988 | 1156 | ||
989 | #ifdef CONFIG_SCHEDSTATS | 1157 | new_cpu = cpu; |
1158 | |||
990 | schedstat_inc(rq, ttwu_cnt); | 1159 | schedstat_inc(rq, ttwu_cnt); |
991 | if (cpu == this_cpu) { | 1160 | if (cpu == this_cpu) { |
992 | schedstat_inc(rq, ttwu_local); | 1161 | schedstat_inc(rq, ttwu_local); |
993 | } else { | 1162 | goto out_set_cpu; |
994 | for_each_domain(this_cpu, sd) { | 1163 | } |
995 | if (cpu_isset(cpu, sd->span)) { | 1164 | |
996 | schedstat_inc(sd, ttwu_wake_remote); | 1165 | for_each_domain(this_cpu, sd) { |
997 | break; | 1166 | if (cpu_isset(cpu, sd->span)) { |
998 | } | 1167 | schedstat_inc(sd, ttwu_wake_remote); |
1168 | this_sd = sd; | ||
1169 | break; | ||
999 | } | 1170 | } |
1000 | } | 1171 | } |
1001 | #endif | ||
1002 | 1172 | ||
1003 | new_cpu = cpu; | 1173 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1004 | if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1005 | goto out_set_cpu; | 1174 | goto out_set_cpu; |
1006 | 1175 | ||
1007 | load = source_load(cpu); | ||
1008 | this_load = target_load(this_cpu); | ||
1009 | |||
1010 | /* | 1176 | /* |
1011 | * If sync wakeup then subtract the (maximum possible) effect of | 1177 | * Check for affine wakeup and passive balancing possibilities. |
1012 | * the currently running task from the load of the current CPU: | ||
1013 | */ | 1178 | */ |
1014 | if (sync) | 1179 | if (this_sd) { |
1015 | this_load -= SCHED_LOAD_SCALE; | 1180 | int idx = this_sd->wake_idx; |
1181 | unsigned int imbalance; | ||
1016 | 1182 | ||
1017 | /* Don't pull the task off an idle CPU to a busy one */ | 1183 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; |
1018 | if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) | ||
1019 | goto out_set_cpu; | ||
1020 | 1184 | ||
1021 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | 1185 | load = source_load(cpu, idx); |
1186 | this_load = target_load(this_cpu, idx); | ||
1022 | 1187 | ||
1023 | /* | 1188 | new_cpu = this_cpu; /* Wake to this CPU if we can */ |
1024 | * Scan domains for affine wakeup and passive balancing | ||
1025 | * possibilities. | ||
1026 | */ | ||
1027 | for_each_domain(this_cpu, sd) { | ||
1028 | unsigned int imbalance; | ||
1029 | /* | ||
1030 | * Start passive balancing when half the imbalance_pct | ||
1031 | * limit is reached. | ||
1032 | */ | ||
1033 | imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; | ||
1034 | 1189 | ||
1035 | if ((sd->flags & SD_WAKE_AFFINE) && | 1190 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1036 | !task_hot(p, rq->timestamp_last_tick, sd)) { | 1191 | unsigned long tl = this_load; |
1037 | /* | 1192 | /* |
1038 | * This domain has SD_WAKE_AFFINE and p is cache cold | 1193 | * If sync wakeup then subtract the (maximum possible) |
1039 | * in this domain. | 1194 | * effect of the currently running task from the load |
1195 | * of the current CPU: | ||
1040 | */ | 1196 | */ |
1041 | if (cpu_isset(cpu, sd->span)) { | 1197 | if (sync) |
1042 | schedstat_inc(sd, ttwu_move_affine); | 1198 | tl -= SCHED_LOAD_SCALE; |
1199 | |||
1200 | if ((tl <= load && | ||
1201 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | ||
1202 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | ||
1203 | /* | ||
1204 | * This domain has SD_WAKE_AFFINE and | ||
1205 | * p is cache cold in this domain, and | ||
1206 | * there is no bad imbalance. | ||
1207 | */ | ||
1208 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1043 | goto out_set_cpu; | 1209 | goto out_set_cpu; |
1044 | } | 1210 | } |
1045 | } else if ((sd->flags & SD_WAKE_BALANCE) && | 1211 | } |
1046 | imbalance*this_load <= 100*load) { | 1212 | |
1047 | /* | 1213 | /* |
1048 | * This domain has SD_WAKE_BALANCE and there is | 1214 | * Start passive balancing when half the imbalance_pct |
1049 | * an imbalance. | 1215 | * limit is reached. |
1050 | */ | 1216 | */ |
1051 | if (cpu_isset(cpu, sd->span)) { | 1217 | if (this_sd->flags & SD_WAKE_BALANCE) { |
1052 | schedstat_inc(sd, ttwu_move_balance); | 1218 | if (imbalance*this_load <= 100*load) { |
1219 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1053 | goto out_set_cpu; | 1220 | goto out_set_cpu; |
1054 | } | 1221 | } |
1055 | } | 1222 | } |
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state) | |||
1120 | return try_to_wake_up(p, state, 0); | 1287 | return try_to_wake_up(p, state, 0); |
1121 | } | 1288 | } |
1122 | 1289 | ||
1123 | #ifdef CONFIG_SMP | ||
1124 | static int find_idlest_cpu(struct task_struct *p, int this_cpu, | ||
1125 | struct sched_domain *sd); | ||
1126 | #endif | ||
1127 | |||
1128 | /* | 1290 | /* |
1129 | * Perform scheduler related setup for a newly forked process p. | 1291 | * Perform scheduler related setup for a newly forked process p. |
1130 | * p is forked by current. | 1292 | * p is forked by current. |
1131 | */ | 1293 | */ |
1132 | void fastcall sched_fork(task_t *p) | 1294 | void fastcall sched_fork(task_t *p, int clone_flags) |
1133 | { | 1295 | { |
1296 | int cpu = get_cpu(); | ||
1297 | |||
1298 | #ifdef CONFIG_SMP | ||
1299 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
1300 | #endif | ||
1301 | set_task_cpu(p, cpu); | ||
1302 | |||
1134 | /* | 1303 | /* |
1135 | * We mark the process as running here, but have not actually | 1304 | * We mark the process as running here, but have not actually |
1136 | * inserted it onto the runqueue yet. This guarantees that | 1305 | * inserted it onto the runqueue yet. This guarantees that |
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p) | |||
1140 | p->state = TASK_RUNNING; | 1309 | p->state = TASK_RUNNING; |
1141 | INIT_LIST_HEAD(&p->run_list); | 1310 | INIT_LIST_HEAD(&p->run_list); |
1142 | p->array = NULL; | 1311 | p->array = NULL; |
1143 | spin_lock_init(&p->switch_lock); | ||
1144 | #ifdef CONFIG_SCHEDSTATS | 1312 | #ifdef CONFIG_SCHEDSTATS |
1145 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1313 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1146 | #endif | 1314 | #endif |
1315 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | ||
1316 | p->oncpu = 0; | ||
1317 | #endif | ||
1147 | #ifdef CONFIG_PREEMPT | 1318 | #ifdef CONFIG_PREEMPT |
1148 | /* | 1319 | /* Want to start with kernel preemption disabled. */ |
1149 | * During context-switch we hold precisely one spinlock, which | ||
1150 | * schedule_tail drops. (in the common case it's this_rq()->lock, | ||
1151 | * but it also can be p->switch_lock.) So we compensate with a count | ||
1152 | * of 1. Also, we want to start with kernel preemption disabled. | ||
1153 | */ | ||
1154 | p->thread_info->preempt_count = 1; | 1320 | p->thread_info->preempt_count = 1; |
1155 | #endif | 1321 | #endif |
1156 | /* | 1322 | /* |
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p) | |||
1174 | * runqueue lock is not a problem. | 1340 | * runqueue lock is not a problem. |
1175 | */ | 1341 | */ |
1176 | current->time_slice = 1; | 1342 | current->time_slice = 1; |
1177 | preempt_disable(); | ||
1178 | scheduler_tick(); | 1343 | scheduler_tick(); |
1179 | local_irq_enable(); | 1344 | } |
1180 | preempt_enable(); | 1345 | local_irq_enable(); |
1181 | } else | 1346 | put_cpu(); |
1182 | local_irq_enable(); | ||
1183 | } | 1347 | } |
1184 | 1348 | ||
1185 | /* | 1349 | /* |
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) | |||
1196 | runqueue_t *rq, *this_rq; | 1360 | runqueue_t *rq, *this_rq; |
1197 | 1361 | ||
1198 | rq = task_rq_lock(p, &flags); | 1362 | rq = task_rq_lock(p, &flags); |
1199 | cpu = task_cpu(p); | ||
1200 | this_cpu = smp_processor_id(); | ||
1201 | |||
1202 | BUG_ON(p->state != TASK_RUNNING); | 1363 | BUG_ON(p->state != TASK_RUNNING); |
1364 | this_cpu = smp_processor_id(); | ||
1365 | cpu = task_cpu(p); | ||
1203 | 1366 | ||
1204 | /* | 1367 | /* |
1205 | * We decrease the sleep average of forking parents | 1368 | * We decrease the sleep average of forking parents |
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p) | |||
1296 | } | 1459 | } |
1297 | 1460 | ||
1298 | /** | 1461 | /** |
1462 | * prepare_task_switch - prepare to switch tasks | ||
1463 | * @rq: the runqueue preparing to switch | ||
1464 | * @next: the task we are going to switch to. | ||
1465 | * | ||
1466 | * This is called with the rq lock held and interrupts off. It must | ||
1467 | * be paired with a subsequent finish_task_switch after the context | ||
1468 | * switch. | ||
1469 | * | ||
1470 | * prepare_task_switch sets up locking and calls architecture specific | ||
1471 | * hooks. | ||
1472 | */ | ||
1473 | static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | ||
1474 | { | ||
1475 | prepare_lock_switch(rq, next); | ||
1476 | prepare_arch_switch(next); | ||
1477 | } | ||
1478 | |||
1479 | /** | ||
1299 | * finish_task_switch - clean up after a task-switch | 1480 | * finish_task_switch - clean up after a task-switch |
1300 | * @prev: the thread we just switched away from. | 1481 | * @prev: the thread we just switched away from. |
1301 | * | 1482 | * |
1302 | * We enter this with the runqueue still locked, and finish_arch_switch() | 1483 | * finish_task_switch must be called after the context switch, paired |
1303 | * will unlock it along with doing any other architecture-specific cleanup | 1484 | * with a prepare_task_switch call before the context switch. |
1304 | * actions. | 1485 | * finish_task_switch will reconcile locking set up by prepare_task_switch, |
1486 | * and do any other architecture-specific cleanup actions. | ||
1305 | * | 1487 | * |
1306 | * Note that we may have delayed dropping an mm in context_switch(). If | 1488 | * Note that we may have delayed dropping an mm in context_switch(). If |
1307 | * so, we finish that here outside of the runqueue lock. (Doing it | 1489 | * so, we finish that here outside of the runqueue lock. (Doing it |
1308 | * with the lock held can cause deadlocks; see schedule() for | 1490 | * with the lock held can cause deadlocks; see schedule() for |
1309 | * details.) | 1491 | * details.) |
1310 | */ | 1492 | */ |
1311 | static inline void finish_task_switch(task_t *prev) | 1493 | static inline void finish_task_switch(runqueue_t *rq, task_t *prev) |
1312 | __releases(rq->lock) | 1494 | __releases(rq->lock) |
1313 | { | 1495 | { |
1314 | runqueue_t *rq = this_rq(); | ||
1315 | struct mm_struct *mm = rq->prev_mm; | 1496 | struct mm_struct *mm = rq->prev_mm; |
1316 | unsigned long prev_task_flags; | 1497 | unsigned long prev_task_flags; |
1317 | 1498 | ||
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev) | |||
1329 | * Manfred Spraul <manfred@colorfullife.com> | 1510 | * Manfred Spraul <manfred@colorfullife.com> |
1330 | */ | 1511 | */ |
1331 | prev_task_flags = prev->flags; | 1512 | prev_task_flags = prev->flags; |
1332 | finish_arch_switch(rq, prev); | 1513 | finish_arch_switch(prev); |
1514 | finish_lock_switch(rq, prev); | ||
1333 | if (mm) | 1515 | if (mm) |
1334 | mmdrop(mm); | 1516 | mmdrop(mm); |
1335 | if (unlikely(prev_task_flags & PF_DEAD)) | 1517 | if (unlikely(prev_task_flags & PF_DEAD)) |
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev) | |||
1343 | asmlinkage void schedule_tail(task_t *prev) | 1525 | asmlinkage void schedule_tail(task_t *prev) |
1344 | __releases(rq->lock) | 1526 | __releases(rq->lock) |
1345 | { | 1527 | { |
1346 | finish_task_switch(prev); | 1528 | runqueue_t *rq = this_rq(); |
1347 | 1529 | finish_task_switch(rq, prev); | |
1530 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | ||
1531 | /* In this case, finish_task_switch does not reenable preemption */ | ||
1532 | preempt_enable(); | ||
1533 | #endif | ||
1348 | if (current->set_child_tid) | 1534 | if (current->set_child_tid) |
1349 | put_user(current->pid, current->set_child_tid); | 1535 | put_user(current->pid, current->set_child_tid); |
1350 | } | 1536 | } |
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
1494 | } | 1680 | } |
1495 | 1681 | ||
1496 | /* | 1682 | /* |
1497 | * find_idlest_cpu - find the least busy runqueue. | ||
1498 | */ | ||
1499 | static int find_idlest_cpu(struct task_struct *p, int this_cpu, | ||
1500 | struct sched_domain *sd) | ||
1501 | { | ||
1502 | unsigned long load, min_load, this_load; | ||
1503 | int i, min_cpu; | ||
1504 | cpumask_t mask; | ||
1505 | |||
1506 | min_cpu = UINT_MAX; | ||
1507 | min_load = ULONG_MAX; | ||
1508 | |||
1509 | cpus_and(mask, sd->span, p->cpus_allowed); | ||
1510 | |||
1511 | for_each_cpu_mask(i, mask) { | ||
1512 | load = target_load(i); | ||
1513 | |||
1514 | if (load < min_load) { | ||
1515 | min_cpu = i; | ||
1516 | min_load = load; | ||
1517 | |||
1518 | /* break out early on an idle CPU: */ | ||
1519 | if (!min_load) | ||
1520 | break; | ||
1521 | } | ||
1522 | } | ||
1523 | |||
1524 | /* add +1 to account for the new task */ | ||
1525 | this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; | ||
1526 | |||
1527 | /* | ||
1528 | * Would with the addition of the new task to the | ||
1529 | * current CPU there be an imbalance between this | ||
1530 | * CPU and the idlest CPU? | ||
1531 | * | ||
1532 | * Use half of the balancing threshold - new-context is | ||
1533 | * a good opportunity to balance. | ||
1534 | */ | ||
1535 | if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) | ||
1536 | return min_cpu; | ||
1537 | |||
1538 | return this_cpu; | ||
1539 | } | ||
1540 | |||
1541 | /* | ||
1542 | * If dest_cpu is allowed for this process, migrate the task to it. | 1683 | * If dest_cpu is allowed for this process, migrate the task to it. |
1543 | * This is accomplished by forcing the cpu_allowed mask to only | 1684 | * This is accomplished by forcing the cpu_allowed mask to only |
1544 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 1685 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
@@ -1571,37 +1712,16 @@ out: | |||
1571 | } | 1712 | } |
1572 | 1713 | ||
1573 | /* | 1714 | /* |
1574 | * sched_exec(): find the highest-level, exec-balance-capable | 1715 | * sched_exec - execve() is a valuable balancing opportunity, because at |
1575 | * domain and try to migrate the task to the least loaded CPU. | 1716 | * this point the task has the smallest effective memory and cache footprint. |
1576 | * | ||
1577 | * execve() is a valuable balancing opportunity, because at this point | ||
1578 | * the task has the smallest effective memory and cache footprint. | ||
1579 | */ | 1717 | */ |
1580 | void sched_exec(void) | 1718 | void sched_exec(void) |
1581 | { | 1719 | { |
1582 | struct sched_domain *tmp, *sd = NULL; | ||
1583 | int new_cpu, this_cpu = get_cpu(); | 1720 | int new_cpu, this_cpu = get_cpu(); |
1584 | 1721 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | |
1585 | /* Prefer the current CPU if there's only this task running */ | ||
1586 | if (this_rq()->nr_running <= 1) | ||
1587 | goto out; | ||
1588 | |||
1589 | for_each_domain(this_cpu, tmp) | ||
1590 | if (tmp->flags & SD_BALANCE_EXEC) | ||
1591 | sd = tmp; | ||
1592 | |||
1593 | if (sd) { | ||
1594 | schedstat_inc(sd, sbe_attempts); | ||
1595 | new_cpu = find_idlest_cpu(current, this_cpu, sd); | ||
1596 | if (new_cpu != this_cpu) { | ||
1597 | schedstat_inc(sd, sbe_pushed); | ||
1598 | put_cpu(); | ||
1599 | sched_migrate_task(current, new_cpu); | ||
1600 | return; | ||
1601 | } | ||
1602 | } | ||
1603 | out: | ||
1604 | put_cpu(); | 1722 | put_cpu(); |
1723 | if (new_cpu != this_cpu) | ||
1724 | sched_migrate_task(current, new_cpu); | ||
1605 | } | 1725 | } |
1606 | 1726 | ||
1607 | /* | 1727 | /* |
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1632 | */ | 1752 | */ |
1633 | static inline | 1753 | static inline |
1634 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 1754 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, |
1635 | struct sched_domain *sd, enum idle_type idle) | 1755 | struct sched_domain *sd, enum idle_type idle, int *all_pinned) |
1636 | { | 1756 | { |
1637 | /* | 1757 | /* |
1638 | * We do not migrate tasks that are: | 1758 | * We do not migrate tasks that are: |
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1640 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 1760 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
1641 | * 3) are cache-hot on their current CPU. | 1761 | * 3) are cache-hot on their current CPU. |
1642 | */ | 1762 | */ |
1643 | if (task_running(rq, p)) | ||
1644 | return 0; | ||
1645 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 1763 | if (!cpu_isset(this_cpu, p->cpus_allowed)) |
1646 | return 0; | 1764 | return 0; |
1765 | *all_pinned = 0; | ||
1766 | |||
1767 | if (task_running(rq, p)) | ||
1768 | return 0; | ||
1647 | 1769 | ||
1648 | /* | 1770 | /* |
1649 | * Aggressive migration if: | 1771 | * Aggressive migration if: |
1650 | * 1) the [whole] cpu is idle, or | 1772 | * 1) task is cache cold, or |
1651 | * 2) too many balance attempts have failed. | 1773 | * 2) too many balance attempts have failed. |
1652 | */ | 1774 | */ |
1653 | 1775 | ||
1654 | if (cpu_and_siblings_are_idle(this_cpu) || \ | 1776 | if (sd->nr_balance_failed > sd->cache_nice_tries) |
1655 | sd->nr_balance_failed > sd->cache_nice_tries) | ||
1656 | return 1; | 1777 | return 1; |
1657 | 1778 | ||
1658 | if (task_hot(p, rq->timestamp_last_tick, sd)) | 1779 | if (task_hot(p, rq->timestamp_last_tick, sd)) |
1659 | return 0; | 1780 | return 0; |
1660 | return 1; | 1781 | return 1; |
1661 | } | 1782 | } |
1662 | 1783 | ||
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1669 | */ | 1790 | */ |
1670 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 1791 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, |
1671 | unsigned long max_nr_move, struct sched_domain *sd, | 1792 | unsigned long max_nr_move, struct sched_domain *sd, |
1672 | enum idle_type idle) | 1793 | enum idle_type idle, int *all_pinned) |
1673 | { | 1794 | { |
1674 | prio_array_t *array, *dst_array; | 1795 | prio_array_t *array, *dst_array; |
1675 | struct list_head *head, *curr; | 1796 | struct list_head *head, *curr; |
1676 | int idx, pulled = 0; | 1797 | int idx, pulled = 0, pinned = 0; |
1677 | task_t *tmp; | 1798 | task_t *tmp; |
1678 | 1799 | ||
1679 | if (max_nr_move <= 0 || busiest->nr_running <= 1) | 1800 | if (max_nr_move == 0) |
1680 | goto out; | 1801 | goto out; |
1681 | 1802 | ||
1803 | pinned = 1; | ||
1804 | |||
1682 | /* | 1805 | /* |
1683 | * We first consider expired tasks. Those will likely not be | 1806 | * We first consider expired tasks. Those will likely not be |
1684 | * executed in the near future, and they are most likely to | 1807 | * executed in the near future, and they are most likely to |
@@ -1717,7 +1840,7 @@ skip_queue: | |||
1717 | 1840 | ||
1718 | curr = curr->prev; | 1841 | curr = curr->prev; |
1719 | 1842 | ||
1720 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { | 1843 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { |
1721 | if (curr != head) | 1844 | if (curr != head) |
1722 | goto skip_queue; | 1845 | goto skip_queue; |
1723 | idx++; | 1846 | idx++; |
@@ -1746,6 +1869,9 @@ out: | |||
1746 | * inside pull_task(). | 1869 | * inside pull_task(). |
1747 | */ | 1870 | */ |
1748 | schedstat_add(sd, lb_gained[idle], pulled); | 1871 | schedstat_add(sd, lb_gained[idle], pulled); |
1872 | |||
1873 | if (all_pinned) | ||
1874 | *all_pinned = pinned; | ||
1749 | return pulled; | 1875 | return pulled; |
1750 | } | 1876 | } |
1751 | 1877 | ||
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1760 | { | 1886 | { |
1761 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 1887 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1762 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 1888 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1889 | int load_idx; | ||
1763 | 1890 | ||
1764 | max_load = this_load = total_load = total_pwr = 0; | 1891 | max_load = this_load = total_load = total_pwr = 0; |
1892 | if (idle == NOT_IDLE) | ||
1893 | load_idx = sd->busy_idx; | ||
1894 | else if (idle == NEWLY_IDLE) | ||
1895 | load_idx = sd->newidle_idx; | ||
1896 | else | ||
1897 | load_idx = sd->idle_idx; | ||
1765 | 1898 | ||
1766 | do { | 1899 | do { |
1767 | unsigned long load; | 1900 | unsigned long load; |
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1776 | for_each_cpu_mask(i, group->cpumask) { | 1909 | for_each_cpu_mask(i, group->cpumask) { |
1777 | /* Bias balancing toward cpus of our domain */ | 1910 | /* Bias balancing toward cpus of our domain */ |
1778 | if (local_group) | 1911 | if (local_group) |
1779 | load = target_load(i); | 1912 | load = target_load(i, load_idx); |
1780 | else | 1913 | else |
1781 | load = source_load(i); | 1914 | load = source_load(i, load_idx); |
1782 | 1915 | ||
1783 | avg_load += load; | 1916 | avg_load += load; |
1784 | } | 1917 | } |
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1792 | if (local_group) { | 1925 | if (local_group) { |
1793 | this_load = avg_load; | 1926 | this_load = avg_load; |
1794 | this = group; | 1927 | this = group; |
1795 | goto nextgroup; | ||
1796 | } else if (avg_load > max_load) { | 1928 | } else if (avg_load > max_load) { |
1797 | max_load = avg_load; | 1929 | max_load = avg_load; |
1798 | busiest = group; | 1930 | busiest = group; |
1799 | } | 1931 | } |
1800 | nextgroup: | ||
1801 | group = group->next; | 1932 | group = group->next; |
1802 | } while (group != sd->groups); | 1933 | } while (group != sd->groups); |
1803 | 1934 | ||
@@ -1870,15 +2001,9 @@ nextgroup: | |||
1870 | 2001 | ||
1871 | /* Get rid of the scaling factor, rounding down as we divide */ | 2002 | /* Get rid of the scaling factor, rounding down as we divide */ |
1872 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | 2003 | *imbalance = *imbalance / SCHED_LOAD_SCALE; |
1873 | |||
1874 | return busiest; | 2004 | return busiest; |
1875 | 2005 | ||
1876 | out_balanced: | 2006 | out_balanced: |
1877 | if (busiest && (idle == NEWLY_IDLE || | ||
1878 | (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { | ||
1879 | *imbalance = 1; | ||
1880 | return busiest; | ||
1881 | } | ||
1882 | 2007 | ||
1883 | *imbalance = 0; | 2008 | *imbalance = 0; |
1884 | return NULL; | 2009 | return NULL; |
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) | |||
1894 | int i; | 2019 | int i; |
1895 | 2020 | ||
1896 | for_each_cpu_mask(i, group->cpumask) { | 2021 | for_each_cpu_mask(i, group->cpumask) { |
1897 | load = source_load(i); | 2022 | load = source_load(i, 0); |
1898 | 2023 | ||
1899 | if (load > max_load) { | 2024 | if (load > max_load) { |
1900 | max_load = load; | 2025 | max_load = load; |
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) | |||
1906 | } | 2031 | } |
1907 | 2032 | ||
1908 | /* | 2033 | /* |
2034 | * Max backoff if we encounter pinned tasks. The value is fairly arbitrary; | |
2035 | * it only needs to be large enough. | |
2036 | */ | ||
2037 | #define MAX_PINNED_INTERVAL 512 | ||
2038 | |||
2039 | /* | ||
1909 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2040 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
1910 | * tasks if there is an imbalance. | 2041 | * tasks if there is an imbalance. |
1911 | * | 2042 | * |
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
1917 | struct sched_group *group; | 2048 | struct sched_group *group; |
1918 | runqueue_t *busiest; | 2049 | runqueue_t *busiest; |
1919 | unsigned long imbalance; | 2050 | unsigned long imbalance; |
1920 | int nr_moved; | 2051 | int nr_moved, all_pinned = 0; |
2052 | int active_balance = 0; | ||
1921 | 2053 | ||
1922 | spin_lock(&this_rq->lock); | 2054 | spin_lock(&this_rq->lock); |
1923 | schedstat_inc(sd, lb_cnt[idle]); | 2055 | schedstat_inc(sd, lb_cnt[idle]); |
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
1934 | goto out_balanced; | 2066 | goto out_balanced; |
1935 | } | 2067 | } |
1936 | 2068 | ||
1937 | /* | 2069 | BUG_ON(busiest == this_rq); |
1938 | * This should be "impossible", but since load | ||
1939 | * balancing is inherently racy and statistical, | ||
1940 | * it could happen in theory. | ||
1941 | */ | ||
1942 | if (unlikely(busiest == this_rq)) { | ||
1943 | WARN_ON(1); | ||
1944 | goto out_balanced; | ||
1945 | } | ||
1946 | 2070 | ||
1947 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 2071 | schedstat_add(sd, lb_imbalance[idle], imbalance); |
1948 | 2072 | ||
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
1956 | */ | 2080 | */ |
1957 | double_lock_balance(this_rq, busiest); | 2081 | double_lock_balance(this_rq, busiest); |
1958 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2082 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
1959 | imbalance, sd, idle); | 2083 | imbalance, sd, idle, |
2084 | &all_pinned); | ||
1960 | spin_unlock(&busiest->lock); | 2085 | spin_unlock(&busiest->lock); |
2086 | |||
2087 | /* All tasks on this runqueue were pinned by CPU affinity */ | ||
2088 | if (unlikely(all_pinned)) | ||
2089 | goto out_balanced; | ||
1961 | } | 2090 | } |
2091 | |||
1962 | spin_unlock(&this_rq->lock); | 2092 | spin_unlock(&this_rq->lock); |
1963 | 2093 | ||
1964 | if (!nr_moved) { | 2094 | if (!nr_moved) { |
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
1966 | sd->nr_balance_failed++; | 2096 | sd->nr_balance_failed++; |
1967 | 2097 | ||
1968 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2098 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
1969 | int wake = 0; | ||
1970 | 2099 | ||
1971 | spin_lock(&busiest->lock); | 2100 | spin_lock(&busiest->lock); |
1972 | if (!busiest->active_balance) { | 2101 | if (!busiest->active_balance) { |
1973 | busiest->active_balance = 1; | 2102 | busiest->active_balance = 1; |
1974 | busiest->push_cpu = this_cpu; | 2103 | busiest->push_cpu = this_cpu; |
1975 | wake = 1; | 2104 | active_balance = 1; |
1976 | } | 2105 | } |
1977 | spin_unlock(&busiest->lock); | 2106 | spin_unlock(&busiest->lock); |
1978 | if (wake) | 2107 | if (active_balance) |
1979 | wake_up_process(busiest->migration_thread); | 2108 | wake_up_process(busiest->migration_thread); |
1980 | 2109 | ||
1981 | /* | 2110 | /* |
1982 | * We've kicked active balancing, reset the failure | 2111 | * We've kicked active balancing, reset the failure |
1983 | * counter. | 2112 | * counter. |
1984 | */ | 2113 | */ |
1985 | sd->nr_balance_failed = sd->cache_nice_tries; | 2114 | sd->nr_balance_failed = sd->cache_nice_tries+1; |
1986 | } | 2115 | } |
1987 | 2116 | } else | |
1988 | /* | ||
1989 | * We were unbalanced, but unsuccessful in move_tasks(), | ||
1990 | * so bump the balance_interval to lessen the lock contention. | ||
1991 | */ | ||
1992 | if (sd->balance_interval < sd->max_interval) | ||
1993 | sd->balance_interval++; | ||
1994 | } else { | ||
1995 | sd->nr_balance_failed = 0; | 2117 | sd->nr_balance_failed = 0; |
1996 | 2118 | ||
2119 | if (likely(!active_balance)) { | ||
1997 | /* We were unbalanced, so reset the balancing interval */ | 2120 | /* We were unbalanced, so reset the balancing interval */ |
1998 | sd->balance_interval = sd->min_interval; | 2121 | sd->balance_interval = sd->min_interval; |
2122 | } else { | ||
2123 | /* | ||
2124 | * If we've begun active balancing, start to back off. This | ||
2125 | * case may not be covered by the all_pinned logic if there | ||
2126 | * is only 1 task on the busy runqueue (because we don't call | ||
2127 | * move_tasks). | ||
2128 | */ | ||
2129 | if (sd->balance_interval < sd->max_interval) | ||
2130 | sd->balance_interval *= 2; | ||
1999 | } | 2131 | } |
2000 | 2132 | ||
2001 | return nr_moved; | 2133 | return nr_moved; |
@@ -2005,8 +2137,10 @@ out_balanced: | |||
2005 | 2137 | ||
2006 | schedstat_inc(sd, lb_balanced[idle]); | 2138 | schedstat_inc(sd, lb_balanced[idle]); |
2007 | 2139 | ||
2140 | sd->nr_balance_failed = 0; | ||
2008 | /* tune up the balancing interval */ | 2141 | /* tune up the balancing interval */ |
2009 | if (sd->balance_interval < sd->max_interval) | 2142 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
2143 | (sd->balance_interval < sd->max_interval)) | ||
2010 | sd->balance_interval *= 2; | 2144 | sd->balance_interval *= 2; |
2011 | 2145 | ||
2012 | return 0; | 2146 | return 0; |
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2030 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2164 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
2031 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); | 2165 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); |
2032 | if (!group) { | 2166 | if (!group) { |
2033 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
2034 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2167 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
2035 | goto out; | 2168 | goto out_balanced; |
2036 | } | 2169 | } |
2037 | 2170 | ||
2038 | busiest = find_busiest_queue(group); | 2171 | busiest = find_busiest_queue(group); |
2039 | if (!busiest || busiest == this_rq) { | 2172 | if (!busiest) { |
2040 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
2041 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2173 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2042 | goto out; | 2174 | goto out_balanced; |
2043 | } | 2175 | } |
2044 | 2176 | ||
2177 | BUG_ON(busiest == this_rq); | ||
2178 | |||
2045 | /* Attempt to move tasks */ | 2179 | /* Attempt to move tasks */ |
2046 | double_lock_balance(this_rq, busiest); | 2180 | double_lock_balance(this_rq, busiest); |
2047 | 2181 | ||
2048 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | 2182 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); |
2049 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2183 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2050 | imbalance, sd, NEWLY_IDLE); | 2184 | imbalance, sd, NEWLY_IDLE, NULL); |
2051 | if (!nr_moved) | 2185 | if (!nr_moved) |
2052 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | 2186 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); |
2187 | else | ||
2188 | sd->nr_balance_failed = 0; | ||
2053 | 2189 | ||
2054 | spin_unlock(&busiest->lock); | 2190 | spin_unlock(&busiest->lock); |
2055 | |||
2056 | out: | ||
2057 | return nr_moved; | 2191 | return nr_moved; |
2192 | |||
2193 | out_balanced: | ||
2194 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | ||
2195 | sd->nr_balance_failed = 0; | ||
2196 | return 0; | ||
2058 | } | 2197 | } |
2059 | 2198 | ||
2060 | /* | 2199 | /* |
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) | |||
2086 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | 2225 | static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) |
2087 | { | 2226 | { |
2088 | struct sched_domain *sd; | 2227 | struct sched_domain *sd; |
2089 | struct sched_group *cpu_group; | ||
2090 | runqueue_t *target_rq; | 2228 | runqueue_t *target_rq; |
2091 | cpumask_t visited_cpus; | 2229 | int target_cpu = busiest_rq->push_cpu; |
2092 | int cpu; | 2230 | |
2231 | if (busiest_rq->nr_running <= 1) | ||
2232 | /* no task to move */ | ||
2233 | return; | ||
2234 | |||
2235 | target_rq = cpu_rq(target_cpu); | ||
2093 | 2236 | ||
2094 | /* | 2237 | /* |
2095 | * Search for suitable CPUs to push tasks to in successively higher | 2238 | * This condition is "impossible", if it occurs |
2096 | * domains with SD_LOAD_BALANCE set. | 2239 | * we need to fix it. Originally reported by |
2240 | * Bjorn Helgaas on a 128-cpu setup. | ||
2097 | */ | 2241 | */ |
2098 | visited_cpus = CPU_MASK_NONE; | 2242 | BUG_ON(busiest_rq == target_rq); |
2099 | for_each_domain(busiest_cpu, sd) { | ||
2100 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
2101 | /* no more domains to search */ | ||
2102 | break; | ||
2103 | 2243 | ||
2104 | schedstat_inc(sd, alb_cnt); | 2244 | /* move a task from busiest_rq to target_rq */ |
2245 | double_lock_balance(busiest_rq, target_rq); | ||
2105 | 2246 | ||
2106 | cpu_group = sd->groups; | 2247 | /* Search for an sd spanning us and the target CPU. */ |
2107 | do { | 2248 | for_each_domain(target_cpu, sd) |
2108 | for_each_cpu_mask(cpu, cpu_group->cpumask) { | 2249 | if ((sd->flags & SD_LOAD_BALANCE) && |
2109 | if (busiest_rq->nr_running <= 1) | 2250 | cpu_isset(busiest_cpu, sd->span)) |
2110 | /* no more tasks left to move */ | 2251 | break; |
2111 | return; | 2252 | |
2112 | if (cpu_isset(cpu, visited_cpus)) | 2253 | if (unlikely(sd == NULL)) |
2113 | continue; | 2254 | goto out; |
2114 | cpu_set(cpu, visited_cpus); | 2255 | |
2115 | if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) | 2256 | schedstat_inc(sd, alb_cnt); |
2116 | continue; | 2257 | |
2117 | 2258 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | |
2118 | target_rq = cpu_rq(cpu); | 2259 | schedstat_inc(sd, alb_pushed); |
2119 | /* | 2260 | else |
2120 | * This condition is "impossible", if it occurs | 2261 | schedstat_inc(sd, alb_failed); |
2121 | * we need to fix it. Originally reported by | 2262 | out: |
2122 | * Bjorn Helgaas on a 128-cpu setup. | 2263 | spin_unlock(&target_rq->lock); |
2123 | */ | ||
2124 | BUG_ON(busiest_rq == target_rq); | ||
2125 | |||
2126 | /* move a task from busiest_rq to target_rq */ | ||
2127 | double_lock_balance(busiest_rq, target_rq); | ||
2128 | if (move_tasks(target_rq, cpu, busiest_rq, | ||
2129 | 1, sd, SCHED_IDLE)) { | ||
2130 | schedstat_inc(sd, alb_pushed); | ||
2131 | } else { | ||
2132 | schedstat_inc(sd, alb_failed); | ||
2133 | } | ||
2134 | spin_unlock(&target_rq->lock); | ||
2135 | } | ||
2136 | cpu_group = cpu_group->next; | ||
2137 | } while (cpu_group != sd->groups); | ||
2138 | } | ||
2139 | } | 2264 | } |
2140 | 2265 | ||
2141 | /* | 2266 | /* |
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2156 | unsigned long old_load, this_load; | 2281 | unsigned long old_load, this_load; |
2157 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); | 2282 | unsigned long j = jiffies + CPU_OFFSET(this_cpu); |
2158 | struct sched_domain *sd; | 2283 | struct sched_domain *sd; |
2284 | int i; | ||
2159 | 2285 | ||
2160 | /* Update our load */ | ||
2161 | old_load = this_rq->cpu_load; | ||
2162 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | 2286 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; |
2163 | /* | 2287 | /* Update our load */ |
2164 | * Round up the averaging division if load is increasing. This | 2288 | for (i = 0; i < 3; i++) { |
2165 | * prevents us from getting stuck on 9 if the load is 10, for | 2289 | unsigned long new_load = this_load; |
2166 | * example. | 2290 | int scale = 1 << i; |
2167 | */ | 2291 | old_load = this_rq->cpu_load[i]; |
2168 | if (this_load > old_load) | 2292 | /* |
2169 | old_load++; | 2293 | * Round up the averaging division if load is increasing. This |
2170 | this_rq->cpu_load = (old_load + this_load) / 2; | 2294 | * prevents us from getting stuck on 9 if the load is 10, for |
2295 | * example. | ||
2296 | */ | ||
2297 | if (new_load > old_load) | ||
2298 | new_load += scale-1; | ||
2299 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | ||
2300 | } | ||
2171 | 2301 | ||
2172 | for_each_domain(this_cpu, sd) { | 2302 | for_each_domain(this_cpu, sd) { |
2173 | unsigned long interval; | 2303 | unsigned long interval; |
@@ -2447,11 +2577,15 @@ out: | |||
2447 | #ifdef CONFIG_SCHED_SMT | 2577 | #ifdef CONFIG_SCHED_SMT |
2448 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2578 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) |
2449 | { | 2579 | { |
2450 | struct sched_domain *sd = this_rq->sd; | 2580 | struct sched_domain *tmp, *sd = NULL; |
2451 | cpumask_t sibling_map; | 2581 | cpumask_t sibling_map; |
2452 | int i; | 2582 | int i; |
2453 | 2583 | ||
2454 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 2584 | for_each_domain(this_cpu, tmp) |
2585 | if (tmp->flags & SD_SHARE_CPUPOWER) | ||
2586 | sd = tmp; | ||
2587 | |||
2588 | if (!sd) | ||
2455 | return; | 2589 | return; |
2456 | 2590 | ||
2457 | /* | 2591 | /* |
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
2492 | 2626 | ||
2493 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2627 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
2494 | { | 2628 | { |
2495 | struct sched_domain *sd = this_rq->sd; | 2629 | struct sched_domain *tmp, *sd = NULL; |
2496 | cpumask_t sibling_map; | 2630 | cpumask_t sibling_map; |
2497 | prio_array_t *array; | 2631 | prio_array_t *array; |
2498 | int ret = 0, i; | 2632 | int ret = 0, i; |
2499 | task_t *p; | 2633 | task_t *p; |
2500 | 2634 | ||
2501 | if (!(sd->flags & SD_SHARE_CPUPOWER)) | 2635 | for_each_domain(this_cpu, tmp) |
2636 | if (tmp->flags & SD_SHARE_CPUPOWER) | ||
2637 | sd = tmp; | ||
2638 | |||
2639 | if (!sd) | ||
2502 | return 0; | 2640 | return 0; |
2503 | 2641 | ||
2504 | /* | 2642 | /* |
@@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val) | |||
2576 | /* | 2714 | /* |
2577 | * Underflow? | 2715 | * Underflow? |
2578 | */ | 2716 | */ |
2579 | BUG_ON(((int)preempt_count() < 0)); | 2717 | BUG_ON((preempt_count() < 0)); |
2580 | preempt_count() += val; | 2718 | preempt_count() += val; |
2581 | /* | 2719 | /* |
2582 | * Spinlock count overflowing soon? | 2720 | * Spinlock count overflowing soon? |
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void) | |||
2613 | struct list_head *queue; | 2751 | struct list_head *queue; |
2614 | unsigned long long now; | 2752 | unsigned long long now; |
2615 | unsigned long run_time; | 2753 | unsigned long run_time; |
2616 | int cpu, idx; | 2754 | int cpu, idx, new_prio; |
2617 | 2755 | ||
2618 | /* | 2756 | /* |
2619 | * Test if we are atomic. Since do_exit() needs to call into | 2757 | * Test if we are atomic. Since do_exit() needs to call into |
@@ -2735,9 +2873,14 @@ go_idle: | |||
2735 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | 2873 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
2736 | 2874 | ||
2737 | array = next->array; | 2875 | array = next->array; |
2738 | dequeue_task(next, array); | 2876 | new_prio = recalc_task_prio(next, next->timestamp + delta); |
2739 | recalc_task_prio(next, next->timestamp + delta); | 2877 | |
2740 | enqueue_task(next, array); | 2878 | if (unlikely(next->prio != new_prio)) { |
2879 | dequeue_task(next, array); | ||
2880 | next->prio = new_prio; | ||
2881 | enqueue_task(next, array); | ||
2882 | } else | ||
2883 | requeue_task(next, array); | ||
2741 | } | 2884 | } |
2742 | next->activated = 0; | 2885 | next->activated = 0; |
2743 | switch_tasks: | 2886 | switch_tasks: |
@@ -2761,11 +2904,15 @@ switch_tasks: | |||
2761 | rq->curr = next; | 2904 | rq->curr = next; |
2762 | ++*switch_count; | 2905 | ++*switch_count; |
2763 | 2906 | ||
2764 | prepare_arch_switch(rq, next); | 2907 | prepare_task_switch(rq, next); |
2765 | prev = context_switch(rq, prev, next); | 2908 | prev = context_switch(rq, prev, next); |
2766 | barrier(); | 2909 | barrier(); |
2767 | 2910 | /* | |
2768 | finish_task_switch(prev); | 2911 | * this_rq must be evaluated again because prev may have moved |
2912 | * CPUs since it called schedule(), thus the 'rq' on its stack | ||
2913 | * frame will be invalid. | ||
2914 | */ | ||
2915 | finish_task_switch(this_rq(), prev); | ||
2769 | } else | 2916 | } else |
2770 | spin_unlock_irq(&rq->lock); | 2917 | spin_unlock_irq(&rq->lock); |
2771 | 2918 | ||
@@ -2869,7 +3016,7 @@ need_resched: | |||
2869 | 3016 | ||
2870 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) | 3017 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) |
2871 | { | 3018 | { |
2872 | task_t *p = curr->task; | 3019 | task_t *p = curr->private; |
2873 | return try_to_wake_up(p, mode, sync); | 3020 | return try_to_wake_up(p, mode, sync); |
2874 | } | 3021 | } |
2875 | 3022 | ||
@@ -3231,8 +3378,8 @@ EXPORT_SYMBOL(set_user_nice); | |||
3231 | */ | 3378 | */ |
3232 | int can_nice(const task_t *p, const int nice) | 3379 | int can_nice(const task_t *p, const int nice) |
3233 | { | 3380 | { |
3234 | /* convert nice value [19,-20] to rlimit style value [0,39] */ | 3381 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
3235 | int nice_rlim = 19 - nice; | 3382 | int nice_rlim = 20 - nice; |
3236 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 3383 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
3237 | capable(CAP_SYS_NICE)); | 3384 | capable(CAP_SYS_NICE)); |
3238 | } | 3385 | } |
@@ -3301,15 +3448,7 @@ int task_nice(const task_t *p) | |||
3301 | { | 3448 | { |
3302 | return TASK_NICE(p); | 3449 | return TASK_NICE(p); |
3303 | } | 3450 | } |
3304 | |||
3305 | /* | ||
3306 | * The only users of task_nice are binfmt_elf and binfmt_elf32. | ||
3307 | * binfmt_elf is no longer modular, but binfmt_elf32 still is. | ||
3308 | * Therefore, task_nice is needed if there is a compat_mode. | ||
3309 | */ | ||
3310 | #ifdef CONFIG_COMPAT | ||
3311 | EXPORT_SYMBOL_GPL(task_nice); | 3451 | EXPORT_SYMBOL_GPL(task_nice); |
3312 | #endif | ||
3313 | 3452 | ||
3314 | /** | 3453 | /** |
3315 | * idle_cpu - is a given cpu idle currently? | 3454 | * idle_cpu - is a given cpu idle currently? |
@@ -3347,7 +3486,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
3347 | p->policy = policy; | 3486 | p->policy = policy; |
3348 | p->rt_priority = prio; | 3487 | p->rt_priority = prio; |
3349 | if (policy != SCHED_NORMAL) | 3488 | if (policy != SCHED_NORMAL) |
3350 | p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; | 3489 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; |
3351 | else | 3490 | else |
3352 | p->prio = p->static_prio; | 3491 | p->prio = p->static_prio; |
3353 | } | 3492 | } |
@@ -3379,18 +3518,31 @@ recheck: | |||
3379 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. | 3518 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. |
3380 | */ | 3519 | */ |
3381 | if (param->sched_priority < 0 || | 3520 | if (param->sched_priority < 0 || |
3382 | param->sched_priority > MAX_USER_RT_PRIO-1) | 3521 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
3522 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | ||
3383 | return -EINVAL; | 3523 | return -EINVAL; |
3384 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) | 3524 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) |
3385 | return -EINVAL; | 3525 | return -EINVAL; |
3386 | 3526 | ||
3387 | if ((policy == SCHED_FIFO || policy == SCHED_RR) && | 3527 | /* |
3388 | param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && | 3528 | * Allow unprivileged RT tasks to decrease priority: |
3389 | !capable(CAP_SYS_NICE)) | 3529 | */ |
3390 | return -EPERM; | 3530 | if (!capable(CAP_SYS_NICE)) { |
3391 | if ((current->euid != p->euid) && (current->euid != p->uid) && | 3531 | /* can't change policy */ |
3392 | !capable(CAP_SYS_NICE)) | 3532 | if (policy != p->policy && |
3393 | return -EPERM; | 3533 | !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) |
3534 | return -EPERM; | ||
3535 | /* can't increase priority */ | ||
3536 | if (policy != SCHED_NORMAL && | ||
3537 | param->sched_priority > p->rt_priority && | ||
3538 | param->sched_priority > | ||
3539 | p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) | ||
3540 | return -EPERM; | ||
3541 | /* can't change other user's priorities */ | ||
3542 | if ((current->euid != p->euid) && | ||
3543 | (current->euid != p->uid)) | ||
3544 | return -EPERM; | ||
3545 | } | ||
3394 | 3546 | ||
3395 | retval = security_task_setscheduler(p, policy, param); | 3547 | retval = security_task_setscheduler(p, policy, param); |
3396 | if (retval) | 3548 | if (retval) |
@@ -3727,6 +3879,13 @@ asmlinkage long sys_sched_yield(void) | |||
3727 | 3879 | ||
3728 | static inline void __cond_resched(void) | 3880 | static inline void __cond_resched(void) |
3729 | { | 3881 | { |
3882 | /* | ||
3883 | * The BKS might be reacquired before we have dropped | ||
3884 | * PREEMPT_ACTIVE, which could trigger a second | ||
3885 | * cond_resched() call. | ||
3886 | */ | ||
3887 | if (unlikely(preempt_count())) | ||
3888 | return; | ||
3730 | do { | 3889 | do { |
3731 | add_preempt_count(PREEMPT_ACTIVE); | 3890 | add_preempt_count(PREEMPT_ACTIVE); |
3732 | schedule(); | 3891 | schedule(); |
@@ -4016,6 +4175,14 @@ void show_state(void) | |||
4016 | read_unlock(&tasklist_lock); | 4175 | read_unlock(&tasklist_lock); |
4017 | } | 4176 | } |
4018 | 4177 | ||
4178 | /** | ||
4179 | * init_idle - set up an idle thread for a given CPU | ||
4180 | * @idle: task in question | ||
4181 | * @cpu: cpu the idle task belongs to | ||
4182 | * | ||
4183 | * NOTE: this function does not set the idle thread's NEED_RESCHED | ||
4184 | * flag, to make booting more robust. | ||
4185 | */ | ||
4019 | void __devinit init_idle(task_t *idle, int cpu) | 4186 | void __devinit init_idle(task_t *idle, int cpu) |
4020 | { | 4187 | { |
4021 | runqueue_t *rq = cpu_rq(cpu); | 4188 | runqueue_t *rq = cpu_rq(cpu); |
@@ -4030,7 +4197,9 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
4030 | 4197 | ||
4031 | spin_lock_irqsave(&rq->lock, flags); | 4198 | spin_lock_irqsave(&rq->lock, flags); |
4032 | rq->curr = rq->idle = idle; | 4199 | rq->curr = rq->idle = idle; |
4033 | set_tsk_need_resched(idle); | 4200 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
4201 | idle->oncpu = 1; | ||
4202 | #endif | ||
4034 | spin_unlock_irqrestore(&rq->lock, flags); | 4203 | spin_unlock_irqrestore(&rq->lock, flags); |
4035 | 4204 | ||
4036 | /* Set the preempt count _outside_ the spinlocks! */ | 4205 | /* Set the preempt count _outside_ the spinlocks! */ |
@@ -4174,8 +4343,7 @@ static int migration_thread(void * data) | |||
4174 | struct list_head *head; | 4343 | struct list_head *head; |
4175 | migration_req_t *req; | 4344 | migration_req_t *req; |
4176 | 4345 | ||
4177 | if (current->flags & PF_FREEZE) | 4346 | try_to_freeze(); |
4178 | refrigerator(PF_FREEZE); | ||
4179 | 4347 | ||
4180 | spin_lock_irq(&rq->lock); | 4348 | spin_lock_irq(&rq->lock); |
4181 | 4349 | ||
@@ -4200,17 +4368,9 @@ static int migration_thread(void * data) | |||
4200 | req = list_entry(head->next, migration_req_t, list); | 4368 | req = list_entry(head->next, migration_req_t, list); |
4201 | list_del_init(head->next); | 4369 | list_del_init(head->next); |
4202 | 4370 | ||
4203 | if (req->type == REQ_MOVE_TASK) { | 4371 | spin_unlock(&rq->lock); |
4204 | spin_unlock(&rq->lock); | 4372 | __migrate_task(req->task, cpu, req->dest_cpu); |
4205 | __migrate_task(req->task, cpu, req->dest_cpu); | 4373 | local_irq_enable(); |
4206 | local_irq_enable(); | ||
4207 | } else if (req->type == REQ_SET_DOMAIN) { | ||
4208 | rq->sd = req->sd; | ||
4209 | spin_unlock_irq(&rq->lock); | ||
4210 | } else { | ||
4211 | spin_unlock_irq(&rq->lock); | ||
4212 | WARN_ON(1); | ||
4213 | } | ||
4214 | 4374 | ||
4215 | complete(&req->done); | 4375 | complete(&req->done); |
4216 | } | 4376 | } |
@@ -4441,7 +4601,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4441 | migration_req_t *req; | 4601 | migration_req_t *req; |
4442 | req = list_entry(rq->migration_queue.next, | 4602 | req = list_entry(rq->migration_queue.next, |
4443 | migration_req_t, list); | 4603 | migration_req_t, list); |
4444 | BUG_ON(req->type != REQ_MOVE_TASK); | ||
4445 | list_del_init(&req->list); | 4604 | list_del_init(&req->list); |
4446 | complete(&req->done); | 4605 | complete(&req->done); |
4447 | } | 4606 | } |
@@ -4472,12 +4631,17 @@ int __init migration_init(void) | |||
4472 | #endif | 4631 | #endif |
4473 | 4632 | ||
4474 | #ifdef CONFIG_SMP | 4633 | #ifdef CONFIG_SMP |
4475 | #define SCHED_DOMAIN_DEBUG | 4634 | #undef SCHED_DOMAIN_DEBUG |
4476 | #ifdef SCHED_DOMAIN_DEBUG | 4635 | #ifdef SCHED_DOMAIN_DEBUG |
4477 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 4636 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
4478 | { | 4637 | { |
4479 | int level = 0; | 4638 | int level = 0; |
4480 | 4639 | ||
4640 | if (!sd) { | ||
4641 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | ||
4642 | return; | ||
4643 | } | ||
4644 | |||
4481 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 4645 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
4482 | 4646 | ||
4483 | do { | 4647 | do { |
@@ -4560,37 +4724,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
4560 | #define sched_domain_debug(sd, cpu) {} | 4724 | #define sched_domain_debug(sd, cpu) {} |
4561 | #endif | 4725 | #endif |
4562 | 4726 | ||
4727 | static int sd_degenerate(struct sched_domain *sd) | ||
4728 | { | ||
4729 | if (cpus_weight(sd->span) == 1) | ||
4730 | return 1; | ||
4731 | |||
4732 | /* Following flags need at least 2 groups */ | ||
4733 | if (sd->flags & (SD_LOAD_BALANCE | | ||
4734 | SD_BALANCE_NEWIDLE | | ||
4735 | SD_BALANCE_FORK | | ||
4736 | SD_BALANCE_EXEC)) { | ||
4737 | if (sd->groups != sd->groups->next) | ||
4738 | return 0; | ||
4739 | } | ||
4740 | |||
4741 | /* Following flags don't use groups */ | ||
4742 | if (sd->flags & (SD_WAKE_IDLE | | ||
4743 | SD_WAKE_AFFINE | | ||
4744 | SD_WAKE_BALANCE)) | ||
4745 | return 0; | ||
4746 | |||
4747 | return 1; | ||
4748 | } | ||
4749 | |||
4750 | static int sd_parent_degenerate(struct sched_domain *sd, | ||
4751 | struct sched_domain *parent) | ||
4752 | { | ||
4753 | unsigned long cflags = sd->flags, pflags = parent->flags; | ||
4754 | |||
4755 | if (sd_degenerate(parent)) | ||
4756 | return 1; | ||
4757 | |||
4758 | if (!cpus_equal(sd->span, parent->span)) | ||
4759 | return 0; | ||
4760 | |||
4761 | /* Does parent contain flags not in child? */ | ||
4762 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | ||
4763 | if (cflags & SD_WAKE_AFFINE) | ||
4764 | pflags &= ~SD_WAKE_BALANCE; | ||
4765 | /* Flags needing groups don't count if only 1 group in parent */ | ||
4766 | if (parent->groups == parent->groups->next) { | ||
4767 | pflags &= ~(SD_LOAD_BALANCE | | ||
4768 | SD_BALANCE_NEWIDLE | | ||
4769 | SD_BALANCE_FORK | | ||
4770 | SD_BALANCE_EXEC); | ||
4771 | } | ||
4772 | if (~cflags & pflags) | ||
4773 | return 0; | ||
4774 | |||
4775 | return 1; | ||
4776 | } | ||
4777 | |||
4563 | /* | 4778 | /* |
4564 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 4779 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
4565 | * hold the hotplug lock. | 4780 | * hold the hotplug lock. |
4566 | */ | 4781 | */ |
4567 | void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) | 4782 | void cpu_attach_domain(struct sched_domain *sd, int cpu) |
4568 | { | 4783 | { |
4569 | migration_req_t req; | ||
4570 | unsigned long flags; | ||
4571 | runqueue_t *rq = cpu_rq(cpu); | 4784 | runqueue_t *rq = cpu_rq(cpu); |
4572 | int local = 1; | 4785 | struct sched_domain *tmp; |
4573 | |||
4574 | sched_domain_debug(sd, cpu); | ||
4575 | |||
4576 | spin_lock_irqsave(&rq->lock, flags); | ||
4577 | 4786 | ||
4578 | if (cpu == smp_processor_id() || !cpu_online(cpu)) { | 4787 | /* Remove the sched domains which do not contribute to scheduling. */ |
4579 | rq->sd = sd; | 4788 | for (tmp = sd; tmp; tmp = tmp->parent) { |
4580 | } else { | 4789 | struct sched_domain *parent = tmp->parent; |
4581 | init_completion(&req.done); | 4790 | if (!parent) |
4582 | req.type = REQ_SET_DOMAIN; | 4791 | break; |
4583 | req.sd = sd; | 4792 | if (sd_parent_degenerate(tmp, parent)) |
4584 | list_add(&req.list, &rq->migration_queue); | 4793 | tmp->parent = parent->parent; |
4585 | local = 0; | ||
4586 | } | 4794 | } |
4587 | 4795 | ||
4588 | spin_unlock_irqrestore(&rq->lock, flags); | 4796 | if (sd && sd_degenerate(sd)) |
4797 | sd = sd->parent; | ||
4589 | 4798 | ||
4590 | if (!local) { | 4799 | sched_domain_debug(sd, cpu); |
4591 | wake_up_process(rq->migration_thread); | 4800 | |
4592 | wait_for_completion(&req.done); | 4801 | rcu_assign_pointer(rq->sd, sd); |
4593 | } | ||
4594 | } | 4802 | } |
4595 | 4803 | ||
4596 | /* cpus with isolated domains */ | 4804 | /* cpus with isolated domains */ |
@@ -4622,7 +4830,7 @@ __setup ("isolcpus=", isolated_cpu_setup); | |||
4622 | * covered by the given span, and will set each group's ->cpumask correctly, | 4830 | * covered by the given span, and will set each group's ->cpumask correctly, |
4623 | * and ->cpu_power to 0. | 4831 | * and ->cpu_power to 0. |
4624 | */ | 4832 | */ |
4625 | void __devinit init_sched_build_groups(struct sched_group groups[], | 4833 | void init_sched_build_groups(struct sched_group groups[], |
4626 | cpumask_t span, int (*group_fn)(int cpu)) | 4834 | cpumask_t span, int (*group_fn)(int cpu)) |
4627 | { | 4835 | { |
4628 | struct sched_group *first = NULL, *last = NULL; | 4836 | struct sched_group *first = NULL, *last = NULL; |
@@ -4658,13 +4866,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[], | |||
4658 | 4866 | ||
4659 | 4867 | ||
4660 | #ifdef ARCH_HAS_SCHED_DOMAIN | 4868 | #ifdef ARCH_HAS_SCHED_DOMAIN |
4661 | extern void __devinit arch_init_sched_domains(void); | 4869 | extern void build_sched_domains(const cpumask_t *cpu_map); |
4662 | extern void __devinit arch_destroy_sched_domains(void); | 4870 | extern void arch_init_sched_domains(const cpumask_t *cpu_map); |
4871 | extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); | ||
4663 | #else | 4872 | #else |
4664 | #ifdef CONFIG_SCHED_SMT | 4873 | #ifdef CONFIG_SCHED_SMT |
4665 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 4874 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
4666 | static struct sched_group sched_group_cpus[NR_CPUS]; | 4875 | static struct sched_group sched_group_cpus[NR_CPUS]; |
4667 | static int __devinit cpu_to_cpu_group(int cpu) | 4876 | static int cpu_to_cpu_group(int cpu) |
4668 | { | 4877 | { |
4669 | return cpu; | 4878 | return cpu; |
4670 | } | 4879 | } |
@@ -4672,7 +4881,7 @@ static int __devinit cpu_to_cpu_group(int cpu) | |||
4672 | 4881 | ||
4673 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 4882 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
4674 | static struct sched_group sched_group_phys[NR_CPUS]; | 4883 | static struct sched_group sched_group_phys[NR_CPUS]; |
4675 | static int __devinit cpu_to_phys_group(int cpu) | 4884 | static int cpu_to_phys_group(int cpu) |
4676 | { | 4885 | { |
4677 | #ifdef CONFIG_SCHED_SMT | 4886 | #ifdef CONFIG_SCHED_SMT |
4678 | return first_cpu(cpu_sibling_map[cpu]); | 4887 | return first_cpu(cpu_sibling_map[cpu]); |
@@ -4685,7 +4894,7 @@ static int __devinit cpu_to_phys_group(int cpu) | |||
4685 | 4894 | ||
4686 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 4895 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
4687 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; | 4896 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; |
4688 | static int __devinit cpu_to_node_group(int cpu) | 4897 | static int cpu_to_node_group(int cpu) |
4689 | { | 4898 | { |
4690 | return cpu_to_node(cpu); | 4899 | return cpu_to_node(cpu); |
4691 | } | 4900 | } |
@@ -4716,39 +4925,28 @@ static void check_sibling_maps(void) | |||
4716 | #endif | 4925 | #endif |
4717 | 4926 | ||
4718 | /* | 4927 | /* |
4719 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 4928 | * Build sched domains for a given set of cpus and attach the sched domains |
4929 | * to the individual cpus | ||
4720 | */ | 4930 | */ |
4721 | static void __devinit arch_init_sched_domains(void) | 4931 | static void build_sched_domains(const cpumask_t *cpu_map) |
4722 | { | 4932 | { |
4723 | int i; | 4933 | int i; |
4724 | cpumask_t cpu_default_map; | ||
4725 | |||
4726 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
4727 | check_sibling_maps(); | ||
4728 | #endif | ||
4729 | /* | ||
4730 | * Setup mask for cpus without special case scheduling requirements. | ||
4731 | * For now this just excludes isolated cpus, but could be used to | ||
4732 | * exclude other special cases in the future. | ||
4733 | */ | ||
4734 | cpus_complement(cpu_default_map, cpu_isolated_map); | ||
4735 | cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); | ||
4736 | 4934 | ||
4737 | /* | 4935 | /* |
4738 | * Set up domains. Isolated domains just stay on the dummy domain. | 4936 | * Set up domains for cpus specified by the cpu_map. |
4739 | */ | 4937 | */ |
4740 | for_each_cpu_mask(i, cpu_default_map) { | 4938 | for_each_cpu_mask(i, *cpu_map) { |
4741 | int group; | 4939 | int group; |
4742 | struct sched_domain *sd = NULL, *p; | 4940 | struct sched_domain *sd = NULL, *p; |
4743 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 4941 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
4744 | 4942 | ||
4745 | cpus_and(nodemask, nodemask, cpu_default_map); | 4943 | cpus_and(nodemask, nodemask, *cpu_map); |
4746 | 4944 | ||
4747 | #ifdef CONFIG_NUMA | 4945 | #ifdef CONFIG_NUMA |
4748 | sd = &per_cpu(node_domains, i); | 4946 | sd = &per_cpu(node_domains, i); |
4749 | group = cpu_to_node_group(i); | 4947 | group = cpu_to_node_group(i); |
4750 | *sd = SD_NODE_INIT; | 4948 | *sd = SD_NODE_INIT; |
4751 | sd->span = cpu_default_map; | 4949 | sd->span = *cpu_map; |
4752 | sd->groups = &sched_group_nodes[group]; | 4950 | sd->groups = &sched_group_nodes[group]; |
4753 | #endif | 4951 | #endif |
4754 | 4952 | ||
@@ -4766,7 +4964,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4766 | group = cpu_to_cpu_group(i); | 4964 | group = cpu_to_cpu_group(i); |
4767 | *sd = SD_SIBLING_INIT; | 4965 | *sd = SD_SIBLING_INIT; |
4768 | sd->span = cpu_sibling_map[i]; | 4966 | sd->span = cpu_sibling_map[i]; |
4769 | cpus_and(sd->span, sd->span, cpu_default_map); | 4967 | cpus_and(sd->span, sd->span, *cpu_map); |
4770 | sd->parent = p; | 4968 | sd->parent = p; |
4771 | sd->groups = &sched_group_cpus[group]; | 4969 | sd->groups = &sched_group_cpus[group]; |
4772 | #endif | 4970 | #endif |
@@ -4776,7 +4974,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4776 | /* Set up CPU (sibling) groups */ | 4974 | /* Set up CPU (sibling) groups */ |
4777 | for_each_online_cpu(i) { | 4975 | for_each_online_cpu(i) { |
4778 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 4976 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
4779 | cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); | 4977 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
4780 | if (i != first_cpu(this_sibling_map)) | 4978 | if (i != first_cpu(this_sibling_map)) |
4781 | continue; | 4979 | continue; |
4782 | 4980 | ||
@@ -4789,7 +4987,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4789 | for (i = 0; i < MAX_NUMNODES; i++) { | 4987 | for (i = 0; i < MAX_NUMNODES; i++) { |
4790 | cpumask_t nodemask = node_to_cpumask(i); | 4988 | cpumask_t nodemask = node_to_cpumask(i); |
4791 | 4989 | ||
4792 | cpus_and(nodemask, nodemask, cpu_default_map); | 4990 | cpus_and(nodemask, nodemask, *cpu_map); |
4793 | if (cpus_empty(nodemask)) | 4991 | if (cpus_empty(nodemask)) |
4794 | continue; | 4992 | continue; |
4795 | 4993 | ||
@@ -4799,12 +4997,12 @@ static void __devinit arch_init_sched_domains(void) | |||
4799 | 4997 | ||
4800 | #ifdef CONFIG_NUMA | 4998 | #ifdef CONFIG_NUMA |
4801 | /* Set up node groups */ | 4999 | /* Set up node groups */ |
4802 | init_sched_build_groups(sched_group_nodes, cpu_default_map, | 5000 | init_sched_build_groups(sched_group_nodes, *cpu_map, |
4803 | &cpu_to_node_group); | 5001 | &cpu_to_node_group); |
4804 | #endif | 5002 | #endif |
4805 | 5003 | ||
4806 | /* Calculate CPU power for physical packages and nodes */ | 5004 | /* Calculate CPU power for physical packages and nodes */ |
4807 | for_each_cpu_mask(i, cpu_default_map) { | 5005 | for_each_cpu_mask(i, *cpu_map) { |
4808 | int power; | 5006 | int power; |
4809 | struct sched_domain *sd; | 5007 | struct sched_domain *sd; |
4810 | #ifdef CONFIG_SCHED_SMT | 5008 | #ifdef CONFIG_SCHED_SMT |
@@ -4828,7 +5026,7 @@ static void __devinit arch_init_sched_domains(void) | |||
4828 | } | 5026 | } |
4829 | 5027 | ||
4830 | /* Attach the domains */ | 5028 | /* Attach the domains */ |
4831 | for_each_online_cpu(i) { | 5029 | for_each_cpu_mask(i, *cpu_map) { |
4832 | struct sched_domain *sd; | 5030 | struct sched_domain *sd; |
4833 | #ifdef CONFIG_SCHED_SMT | 5031 | #ifdef CONFIG_SCHED_SMT |
4834 | sd = &per_cpu(cpu_domains, i); | 5032 | sd = &per_cpu(cpu_domains, i); |
@@ -4838,41 +5036,85 @@ static void __devinit arch_init_sched_domains(void) | |||
4838 | cpu_attach_domain(sd, i); | 5036 | cpu_attach_domain(sd, i); |
4839 | } | 5037 | } |
4840 | } | 5038 | } |
5039 | /* | ||
5040 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | ||
5041 | */ | ||
5042 | static void arch_init_sched_domains(cpumask_t *cpu_map) | ||
5043 | { | ||
5044 | cpumask_t cpu_default_map; | ||
4841 | 5045 | ||
4842 | #ifdef CONFIG_HOTPLUG_CPU | 5046 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) |
4843 | static void __devinit arch_destroy_sched_domains(void) | 5047 | check_sibling_maps(); |
5048 | #endif | ||
5049 | /* | ||
5050 | * Setup mask for cpus without special case scheduling requirements. | ||
5051 | * For now this just excludes isolated cpus, but could be used to | ||
5052 | * exclude other special cases in the future. | ||
5053 | */ | ||
5054 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | ||
5055 | |||
5056 | build_sched_domains(&cpu_default_map); | ||
5057 | } | ||
5058 | |||
5059 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | ||
4844 | { | 5060 | { |
4845 | /* Do nothing: everything is statically allocated. */ | 5061 | /* Do nothing: everything is statically allocated. */ |
4846 | } | 5062 | } |
4847 | #endif | ||
4848 | 5063 | ||
4849 | #endif /* ARCH_HAS_SCHED_DOMAIN */ | 5064 | #endif /* ARCH_HAS_SCHED_DOMAIN */ |
4850 | 5065 | ||
4851 | /* | 5066 | /* |
4852 | * Initial dummy domain for early boot and for hotplug cpu. Being static, | 5067 | * Detach sched domains from a group of cpus specified in cpu_map |
4853 | * it is initialized to zero, so all balancing flags are cleared which is | 5068 | * These cpus will now be attached to the NULL domain |
4854 | * what we want. | ||
4855 | */ | 5069 | */ |
4856 | static struct sched_domain sched_domain_dummy; | 5070 | static inline void detach_destroy_domains(const cpumask_t *cpu_map) |
5071 | { | ||
5072 | int i; | ||
5073 | |||
5074 | for_each_cpu_mask(i, *cpu_map) | ||
5075 | cpu_attach_domain(NULL, i); | ||
5076 | synchronize_sched(); | ||
5077 | arch_destroy_sched_domains(cpu_map); | ||
5078 | } | ||
5079 | |||
5080 | /* | ||
5081 | * Partition sched domains as specified by the cpumasks below. | ||
5082 | * This attaches all cpus from the cpumasks to the NULL domain, | ||
5083 | * waits for an RCU quiescent period, recalculates sched | ||
5084 | * domain information and then attaches them back to the | ||
5085 | * correct sched domains. | ||
5086 | * Call with the hotplug lock held. | ||
5087 | */ | ||
5088 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | ||
5089 | { | ||
5090 | cpumask_t change_map; | ||
5091 | |||
5092 | cpus_and(*partition1, *partition1, cpu_online_map); | ||
5093 | cpus_and(*partition2, *partition2, cpu_online_map); | ||
5094 | cpus_or(change_map, *partition1, *partition2); | ||
5095 | |||
5096 | /* Detach sched domains from all of the affected cpus */ | ||
5097 | detach_destroy_domains(&change_map); | ||
5098 | if (!cpus_empty(*partition1)) | ||
5099 | build_sched_domains(partition1); | ||
5100 | if (!cpus_empty(*partition2)) | ||
5101 | build_sched_domains(partition2); | ||
5102 | } | ||
4857 | 5103 | ||
4858 | #ifdef CONFIG_HOTPLUG_CPU | 5104 | #ifdef CONFIG_HOTPLUG_CPU |
4859 | /* | 5105 | /* |
4860 | * Force a reinitialization of the sched domains hierarchy. The domains | 5106 | * Force a reinitialization of the sched domains hierarchy. The domains |
4861 | * and groups cannot be updated in place without racing with the balancing | 5107 | * and groups cannot be updated in place without racing with the balancing |
4862 | * code, so we temporarily attach all running cpus to a "dummy" domain | 5108 | * code, so we temporarily attach all running cpus to the NULL domain |
4863 | * which will prevent rebalancing while the sched domains are recalculated. | 5109 | * which will prevent rebalancing while the sched domains are recalculated. |
4864 | */ | 5110 | */ |
4865 | static int update_sched_domains(struct notifier_block *nfb, | 5111 | static int update_sched_domains(struct notifier_block *nfb, |
4866 | unsigned long action, void *hcpu) | 5112 | unsigned long action, void *hcpu) |
4867 | { | 5113 | { |
4868 | int i; | ||
4869 | |||
4870 | switch (action) { | 5114 | switch (action) { |
4871 | case CPU_UP_PREPARE: | 5115 | case CPU_UP_PREPARE: |
4872 | case CPU_DOWN_PREPARE: | 5116 | case CPU_DOWN_PREPARE: |
4873 | for_each_online_cpu(i) | 5117 | detach_destroy_domains(&cpu_online_map); |
4874 | cpu_attach_domain(&sched_domain_dummy, i); | ||
4875 | arch_destroy_sched_domains(); | ||
4876 | return NOTIFY_OK; | 5118 | return NOTIFY_OK; |
4877 | 5119 | ||
4878 | case CPU_UP_CANCELED: | 5120 | case CPU_UP_CANCELED: |
@@ -4888,7 +5130,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
4888 | } | 5130 | } |
4889 | 5131 | ||
4890 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 5132 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
4891 | arch_init_sched_domains(); | 5133 | arch_init_sched_domains(&cpu_online_map); |
4892 | 5134 | ||
4893 | return NOTIFY_OK; | 5135 | return NOTIFY_OK; |
4894 | } | 5136 | } |
@@ -4897,7 +5139,7 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
4897 | void __init sched_init_smp(void) | 5139 | void __init sched_init_smp(void) |
4898 | { | 5140 | { |
4899 | lock_cpu_hotplug(); | 5141 | lock_cpu_hotplug(); |
4900 | arch_init_sched_domains(); | 5142 | arch_init_sched_domains(&cpu_online_map); |
4901 | unlock_cpu_hotplug(); | 5143 | unlock_cpu_hotplug(); |
4902 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 5144 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
4903 | hotcpu_notifier(update_sched_domains, 0); | 5145 | hotcpu_notifier(update_sched_domains, 0); |
@@ -4927,13 +5169,15 @@ void __init sched_init(void) | |||
4927 | 5169 | ||
4928 | rq = cpu_rq(i); | 5170 | rq = cpu_rq(i); |
4929 | spin_lock_init(&rq->lock); | 5171 | spin_lock_init(&rq->lock); |
5172 | rq->nr_running = 0; | ||
4930 | rq->active = rq->arrays; | 5173 | rq->active = rq->arrays; |
4931 | rq->expired = rq->arrays + 1; | 5174 | rq->expired = rq->arrays + 1; |
4932 | rq->best_expired_prio = MAX_PRIO; | 5175 | rq->best_expired_prio = MAX_PRIO; |
4933 | 5176 | ||
4934 | #ifdef CONFIG_SMP | 5177 | #ifdef CONFIG_SMP |
4935 | rq->sd = &sched_domain_dummy; | 5178 | rq->sd = NULL; |
4936 | rq->cpu_load = 0; | 5179 | for (j = 1; j < 3; j++) |
5180 | rq->cpu_load[j] = 0; | ||
4937 | rq->active_balance = 0; | 5181 | rq->active_balance = 0; |
4938 | rq->push_cpu = 0; | 5182 | rq->push_cpu = 0; |
4939 | rq->migration_thread = NULL; | 5183 | rq->migration_thread = NULL; |
diff --git a/kernel/signal.c b/kernel/signal.c index c89821b69ae3..d282fea81138 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -213,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | |||
213 | fastcall void recalc_sigpending_tsk(struct task_struct *t) | 213 | fastcall void recalc_sigpending_tsk(struct task_struct *t) |
214 | { | 214 | { |
215 | if (t->signal->group_stop_count > 0 || | 215 | if (t->signal->group_stop_count > 0 || |
216 | (freezing(t)) || | ||
216 | PENDING(&t->pending, &t->blocked) || | 217 | PENDING(&t->pending, &t->blocked) || |
217 | PENDING(&t->signal->shared_pending, &t->blocked)) | 218 | PENDING(&t->signal->shared_pending, &t->blocked)) |
218 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 219 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
@@ -691,7 +692,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
691 | { | 692 | { |
692 | struct task_struct *t; | 693 | struct task_struct *t; |
693 | 694 | ||
694 | if (p->flags & SIGNAL_GROUP_EXIT) | 695 | if (p->signal->flags & SIGNAL_GROUP_EXIT) |
695 | /* | 696 | /* |
696 | * The process is in the middle of dying already. | 697 | * The process is in the middle of dying already. |
697 | */ | 698 | */ |
@@ -2230,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, | |||
2230 | current->state = TASK_INTERRUPTIBLE; | 2231 | current->state = TASK_INTERRUPTIBLE; |
2231 | timeout = schedule_timeout(timeout); | 2232 | timeout = schedule_timeout(timeout); |
2232 | 2233 | ||
2233 | if (current->flags & PF_FREEZE) | 2234 | try_to_freeze(); |
2234 | refrigerator(PF_FREEZE); | ||
2235 | spin_lock_irq(&current->sighand->siglock); | 2235 | spin_lock_irq(&current->sighand->siglock); |
2236 | sig = dequeue_signal(current, &these, &info); | 2236 | sig = dequeue_signal(current, &these, &info); |
2237 | current->blocked = current->real_blocked; | 2237 | current->blocked = current->real_blocked; |
diff --git a/kernel/sys.c b/kernel/sys.c index f006632c2ba7..0bcaed6560ac 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -16,6 +16,8 @@ | |||
16 | #include <linux/init.h> | 16 | #include <linux/init.h> |
17 | #include <linux/highuid.h> | 17 | #include <linux/highuid.h> |
18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
19 | #include <linux/kernel.h> | ||
20 | #include <linux/kexec.h> | ||
19 | #include <linux/workqueue.h> | 21 | #include <linux/workqueue.h> |
20 | #include <linux/device.h> | 22 | #include <linux/device.h> |
21 | #include <linux/key.h> | 23 | #include <linux/key.h> |
@@ -359,6 +361,64 @@ out_unlock: | |||
359 | return retval; | 361 | return retval; |
360 | } | 362 | } |
361 | 363 | ||
364 | void emergency_restart(void) | ||
365 | { | ||
366 | machine_emergency_restart(); | ||
367 | } | ||
368 | EXPORT_SYMBOL_GPL(emergency_restart); | ||
369 | |||
370 | void kernel_restart(char *cmd) | ||
371 | { | ||
372 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | ||
373 | system_state = SYSTEM_RESTART; | ||
374 | device_shutdown(); | ||
375 | if (!cmd) { | ||
376 | printk(KERN_EMERG "Restarting system.\n"); | ||
377 | } else { | ||
378 | printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); | ||
379 | } | ||
380 | printk(".\n"); | ||
381 | machine_restart(cmd); | ||
382 | } | ||
383 | EXPORT_SYMBOL_GPL(kernel_restart); | ||
384 | |||
385 | void kernel_kexec(void) | ||
386 | { | ||
387 | #ifdef CONFIG_KEXEC | ||
388 | struct kimage *image; | ||
389 | image = xchg(&kexec_image, 0); | ||
390 | if (!image) { | ||
391 | return; | ||
392 | } | ||
393 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | ||
394 | system_state = SYSTEM_RESTART; | ||
395 | device_shutdown(); | ||
396 | printk(KERN_EMERG "Starting new kernel\n"); | ||
397 | machine_shutdown(); | ||
398 | machine_kexec(image); | ||
399 | #endif | ||
400 | } | ||
401 | EXPORT_SYMBOL_GPL(kernel_kexec); | ||
402 | |||
403 | void kernel_halt(void) | ||
404 | { | ||
405 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); | ||
406 | system_state = SYSTEM_HALT; | ||
407 | device_shutdown(); | ||
408 | printk(KERN_EMERG "System halted.\n"); | ||
409 | machine_halt(); | ||
410 | } | ||
411 | EXPORT_SYMBOL_GPL(kernel_halt); | ||
412 | |||
413 | void kernel_power_off(void) | ||
414 | { | ||
415 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); | ||
416 | system_state = SYSTEM_POWER_OFF; | ||
417 | device_shutdown(); | ||
418 | printk(KERN_EMERG "Power down.\n"); | ||
419 | machine_power_off(); | ||
420 | } | ||
421 | EXPORT_SYMBOL_GPL(kernel_power_off); | ||
362 | 422 | ||
363 | /* | 423 | /* |
364 | * Reboot system call: for obvious reasons only root may call it, | 424 | * Reboot system call: for obvious reasons only root may call it, |
@@ -387,11 +447,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
387 | lock_kernel(); | 447 | lock_kernel(); |
388 | switch (cmd) { | 448 | switch (cmd) { |
389 | case LINUX_REBOOT_CMD_RESTART: | 449 | case LINUX_REBOOT_CMD_RESTART: |
390 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | 450 | kernel_restart(NULL); |
391 | system_state = SYSTEM_RESTART; | ||
392 | device_shutdown(); | ||
393 | printk(KERN_EMERG "Restarting system.\n"); | ||
394 | machine_restart(NULL); | ||
395 | break; | 451 | break; |
396 | 452 | ||
397 | case LINUX_REBOOT_CMD_CAD_ON: | 453 | case LINUX_REBOOT_CMD_CAD_ON: |
@@ -403,21 +459,13 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
403 | break; | 459 | break; |
404 | 460 | ||
405 | case LINUX_REBOOT_CMD_HALT: | 461 | case LINUX_REBOOT_CMD_HALT: |
406 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); | 462 | kernel_halt(); |
407 | system_state = SYSTEM_HALT; | ||
408 | device_shutdown(); | ||
409 | printk(KERN_EMERG "System halted.\n"); | ||
410 | machine_halt(); | ||
411 | unlock_kernel(); | 463 | unlock_kernel(); |
412 | do_exit(0); | 464 | do_exit(0); |
413 | break; | 465 | break; |
414 | 466 | ||
415 | case LINUX_REBOOT_CMD_POWER_OFF: | 467 | case LINUX_REBOOT_CMD_POWER_OFF: |
416 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); | 468 | kernel_power_off(); |
417 | system_state = SYSTEM_POWER_OFF; | ||
418 | device_shutdown(); | ||
419 | printk(KERN_EMERG "Power down.\n"); | ||
420 | machine_power_off(); | ||
421 | unlock_kernel(); | 469 | unlock_kernel(); |
422 | do_exit(0); | 470 | do_exit(0); |
423 | break; | 471 | break; |
@@ -429,13 +477,14 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
429 | } | 477 | } |
430 | buffer[sizeof(buffer) - 1] = '\0'; | 478 | buffer[sizeof(buffer) - 1] = '\0'; |
431 | 479 | ||
432 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); | 480 | kernel_restart(buffer); |
433 | system_state = SYSTEM_RESTART; | ||
434 | device_shutdown(); | ||
435 | printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); | ||
436 | machine_restart(buffer); | ||
437 | break; | 481 | break; |
438 | 482 | ||
483 | case LINUX_REBOOT_CMD_KEXEC: | ||
484 | kernel_kexec(); | ||
485 | unlock_kernel(); | ||
486 | return -EINVAL; | ||
487 | |||
439 | #ifdef CONFIG_SOFTWARE_SUSPEND | 488 | #ifdef CONFIG_SOFTWARE_SUSPEND |
440 | case LINUX_REBOOT_CMD_SW_SUSPEND: | 489 | case LINUX_REBOOT_CMD_SW_SUSPEND: |
441 | { | 490 | { |
@@ -455,8 +504,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
455 | 504 | ||
456 | static void deferred_cad(void *dummy) | 505 | static void deferred_cad(void *dummy) |
457 | { | 506 | { |
458 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | 507 | kernel_restart(NULL); |
459 | machine_restart(NULL); | ||
460 | } | 508 | } |
461 | 509 | ||
462 | /* | 510 | /* |
@@ -525,7 +573,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) | |||
525 | } | 573 | } |
526 | if (new_egid != old_egid) | 574 | if (new_egid != old_egid) |
527 | { | 575 | { |
528 | current->mm->dumpable = 0; | 576 | current->mm->dumpable = suid_dumpable; |
529 | smp_wmb(); | 577 | smp_wmb(); |
530 | } | 578 | } |
531 | if (rgid != (gid_t) -1 || | 579 | if (rgid != (gid_t) -1 || |
@@ -556,7 +604,7 @@ asmlinkage long sys_setgid(gid_t gid) | |||
556 | { | 604 | { |
557 | if(old_egid != gid) | 605 | if(old_egid != gid) |
558 | { | 606 | { |
559 | current->mm->dumpable=0; | 607 | current->mm->dumpable = suid_dumpable; |
560 | smp_wmb(); | 608 | smp_wmb(); |
561 | } | 609 | } |
562 | current->gid = current->egid = current->sgid = current->fsgid = gid; | 610 | current->gid = current->egid = current->sgid = current->fsgid = gid; |
@@ -565,7 +613,7 @@ asmlinkage long sys_setgid(gid_t gid) | |||
565 | { | 613 | { |
566 | if(old_egid != gid) | 614 | if(old_egid != gid) |
567 | { | 615 | { |
568 | current->mm->dumpable=0; | 616 | current->mm->dumpable = suid_dumpable; |
569 | smp_wmb(); | 617 | smp_wmb(); |
570 | } | 618 | } |
571 | current->egid = current->fsgid = gid; | 619 | current->egid = current->fsgid = gid; |
@@ -596,7 +644,7 @@ static int set_user(uid_t new_ruid, int dumpclear) | |||
596 | 644 | ||
597 | if(dumpclear) | 645 | if(dumpclear) |
598 | { | 646 | { |
599 | current->mm->dumpable = 0; | 647 | current->mm->dumpable = suid_dumpable; |
600 | smp_wmb(); | 648 | smp_wmb(); |
601 | } | 649 | } |
602 | current->uid = new_ruid; | 650 | current->uid = new_ruid; |
@@ -653,7 +701,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | |||
653 | 701 | ||
654 | if (new_euid != old_euid) | 702 | if (new_euid != old_euid) |
655 | { | 703 | { |
656 | current->mm->dumpable=0; | 704 | current->mm->dumpable = suid_dumpable; |
657 | smp_wmb(); | 705 | smp_wmb(); |
658 | } | 706 | } |
659 | current->fsuid = current->euid = new_euid; | 707 | current->fsuid = current->euid = new_euid; |
@@ -703,7 +751,7 @@ asmlinkage long sys_setuid(uid_t uid) | |||
703 | 751 | ||
704 | if (old_euid != uid) | 752 | if (old_euid != uid) |
705 | { | 753 | { |
706 | current->mm->dumpable = 0; | 754 | current->mm->dumpable = suid_dumpable; |
707 | smp_wmb(); | 755 | smp_wmb(); |
708 | } | 756 | } |
709 | current->fsuid = current->euid = uid; | 757 | current->fsuid = current->euid = uid; |
@@ -748,7 +796,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) | |||
748 | if (euid != (uid_t) -1) { | 796 | if (euid != (uid_t) -1) { |
749 | if (euid != current->euid) | 797 | if (euid != current->euid) |
750 | { | 798 | { |
751 | current->mm->dumpable = 0; | 799 | current->mm->dumpable = suid_dumpable; |
752 | smp_wmb(); | 800 | smp_wmb(); |
753 | } | 801 | } |
754 | current->euid = euid; | 802 | current->euid = euid; |
@@ -798,7 +846,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) | |||
798 | if (egid != (gid_t) -1) { | 846 | if (egid != (gid_t) -1) { |
799 | if (egid != current->egid) | 847 | if (egid != current->egid) |
800 | { | 848 | { |
801 | current->mm->dumpable = 0; | 849 | current->mm->dumpable = suid_dumpable; |
802 | smp_wmb(); | 850 | smp_wmb(); |
803 | } | 851 | } |
804 | current->egid = egid; | 852 | current->egid = egid; |
@@ -845,7 +893,7 @@ asmlinkage long sys_setfsuid(uid_t uid) | |||
845 | { | 893 | { |
846 | if (uid != old_fsuid) | 894 | if (uid != old_fsuid) |
847 | { | 895 | { |
848 | current->mm->dumpable = 0; | 896 | current->mm->dumpable = suid_dumpable; |
849 | smp_wmb(); | 897 | smp_wmb(); |
850 | } | 898 | } |
851 | current->fsuid = uid; | 899 | current->fsuid = uid; |
@@ -875,7 +923,7 @@ asmlinkage long sys_setfsgid(gid_t gid) | |||
875 | { | 923 | { |
876 | if (gid != old_fsgid) | 924 | if (gid != old_fsgid) |
877 | { | 925 | { |
878 | current->mm->dumpable = 0; | 926 | current->mm->dumpable = suid_dumpable; |
879 | smp_wmb(); | 927 | smp_wmb(); |
880 | } | 928 | } |
881 | current->fsgid = gid; | 929 | current->fsgid = gid; |
@@ -894,35 +942,69 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
894 | */ | 942 | */ |
895 | if (tbuf) { | 943 | if (tbuf) { |
896 | struct tms tmp; | 944 | struct tms tmp; |
897 | struct task_struct *tsk = current; | ||
898 | struct task_struct *t; | ||
899 | cputime_t utime, stime, cutime, cstime; | 945 | cputime_t utime, stime, cutime, cstime; |
900 | 946 | ||
901 | read_lock(&tasklist_lock); | 947 | #ifdef CONFIG_SMP |
902 | utime = tsk->signal->utime; | 948 | if (thread_group_empty(current)) { |
903 | stime = tsk->signal->stime; | 949 | /* |
904 | t = tsk; | 950 | * Single thread case without the use of any locks. |
905 | do { | 951 | * |
906 | utime = cputime_add(utime, t->utime); | 952 | * We may race with release_task if two threads are |
907 | stime = cputime_add(stime, t->stime); | 953 | * executing. However, release_task first adds up the |
908 | t = next_thread(t); | 954 | * counters (__exit_signal) before removing the task |
909 | } while (t != tsk); | 955 | * from the process tasklist (__unhash_process). |
910 | 956 | * __exit_signal also acquires and releases the | |
911 | /* | 957 | * siglock which results in the proper memory ordering |
912 | * While we have tasklist_lock read-locked, no dying thread | 958 | * so that the list modifications are always visible |
913 | * can be updating current->signal->[us]time. Instead, | 959 | * after the counters have been updated. |
914 | * we got their counts included in the live thread loop. | 960 | * |
915 | * However, another thread can come in right now and | 961 | * If the counters have been updated by the second thread |
916 | * do a wait call that updates current->signal->c[us]time. | 962 | * but the thread has not yet been removed from the list |
917 | * To make sure we always see that pair updated atomically, | 963 | * then the other branch will be executing which will |
918 | * we take the siglock around fetching them. | 964 | * block on tasklist_lock until the exit handling of the |
919 | */ | 965 | * other task is finished. |
920 | spin_lock_irq(&tsk->sighand->siglock); | 966 | * |
921 | cutime = tsk->signal->cutime; | 967 | * This also implies that the sighand->siglock cannot |
922 | cstime = tsk->signal->cstime; | 968 | * be held by another processor. So we can also |
923 | spin_unlock_irq(&tsk->sighand->siglock); | 969 | * skip acquiring that lock. |
924 | read_unlock(&tasklist_lock); | 970 | */ |
971 | utime = cputime_add(current->signal->utime, current->utime); | ||
972 | stime = cputime_add(current->signal->utime, current->stime); | ||
973 | cutime = current->signal->cutime; | ||
974 | cstime = current->signal->cstime; | ||
975 | } else | ||
976 | #endif | ||
977 | { | ||
978 | |||
979 | /* Process with multiple threads */ | ||
980 | struct task_struct *tsk = current; | ||
981 | struct task_struct *t; | ||
925 | 982 | ||
983 | read_lock(&tasklist_lock); | ||
984 | utime = tsk->signal->utime; | ||
985 | stime = tsk->signal->stime; | ||
986 | t = tsk; | ||
987 | do { | ||
988 | utime = cputime_add(utime, t->utime); | ||
989 | stime = cputime_add(stime, t->stime); | ||
990 | t = next_thread(t); | ||
991 | } while (t != tsk); | ||
992 | |||
993 | /* | ||
994 | * While we have tasklist_lock read-locked, no dying thread | ||
995 | * can be updating current->signal->[us]time. Instead, | ||
996 | * we got their counts included in the live thread loop. | ||
997 | * However, another thread can come in right now and | ||
998 | * do a wait call that updates current->signal->c[us]time. | ||
999 | * To make sure we always see that pair updated atomically, | ||
1000 | * we take the siglock around fetching them. | ||
1001 | */ | ||
1002 | spin_lock_irq(&tsk->sighand->siglock); | ||
1003 | cutime = tsk->signal->cutime; | ||
1004 | cstime = tsk->signal->cstime; | ||
1005 | spin_unlock_irq(&tsk->sighand->siglock); | ||
1006 | read_unlock(&tasklist_lock); | ||
1007 | } | ||
926 | tmp.tms_utime = cputime_to_clock_t(utime); | 1008 | tmp.tms_utime = cputime_to_clock_t(utime); |
927 | tmp.tms_stime = cputime_to_clock_t(stime); | 1009 | tmp.tms_stime = cputime_to_clock_t(stime); |
928 | tmp.tms_cutime = cputime_to_clock_t(cutime); | 1010 | tmp.tms_cutime = cputime_to_clock_t(cutime); |
@@ -1225,7 +1307,7 @@ static void groups_sort(struct group_info *group_info) | |||
1225 | } | 1307 | } |
1226 | 1308 | ||
1227 | /* a simple bsearch */ | 1309 | /* a simple bsearch */ |
1228 | static int groups_search(struct group_info *group_info, gid_t grp) | 1310 | int groups_search(struct group_info *group_info, gid_t grp) |
1229 | { | 1311 | { |
1230 | int left, right; | 1312 | int left, right; |
1231 | 1313 | ||
@@ -1652,7 +1734,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
1652 | error = 1; | 1734 | error = 1; |
1653 | break; | 1735 | break; |
1654 | case PR_SET_DUMPABLE: | 1736 | case PR_SET_DUMPABLE: |
1655 | if (arg2 != 0 && arg2 != 1) { | 1737 | if (arg2 < 0 || arg2 > 2) { |
1656 | error = -EINVAL; | 1738 | error = -EINVAL; |
1657 | break; | 1739 | break; |
1658 | } | 1740 | } |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 6f15bea7d1a8..1ab2370e2efa 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -18,6 +18,8 @@ cond_syscall(sys_acct); | |||
18 | cond_syscall(sys_lookup_dcookie); | 18 | cond_syscall(sys_lookup_dcookie); |
19 | cond_syscall(sys_swapon); | 19 | cond_syscall(sys_swapon); |
20 | cond_syscall(sys_swapoff); | 20 | cond_syscall(sys_swapoff); |
21 | cond_syscall(sys_kexec_load); | ||
22 | cond_syscall(compat_sys_kexec_load); | ||
21 | cond_syscall(sys_init_module); | 23 | cond_syscall(sys_init_module); |
22 | cond_syscall(sys_delete_module); | 24 | cond_syscall(sys_delete_module); |
23 | cond_syscall(sys_socketpair); | 25 | cond_syscall(sys_socketpair); |
@@ -77,7 +79,9 @@ cond_syscall(sys_request_key); | |||
77 | cond_syscall(sys_keyctl); | 79 | cond_syscall(sys_keyctl); |
78 | cond_syscall(compat_sys_keyctl); | 80 | cond_syscall(compat_sys_keyctl); |
79 | cond_syscall(compat_sys_socketcall); | 81 | cond_syscall(compat_sys_socketcall); |
80 | cond_syscall(sys_set_zone_reclaim); | 82 | cond_syscall(sys_inotify_init); |
83 | cond_syscall(sys_inotify_add_watch); | ||
84 | cond_syscall(sys_inotify_rm_watch); | ||
81 | 85 | ||
82 | /* arch-specific weak syscall entries */ | 86 | /* arch-specific weak syscall entries */ |
83 | cond_syscall(sys_pciconfig_read); | 87 | cond_syscall(sys_pciconfig_read); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 701d12c63068..3e0bbee549ea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio; | |||
58 | extern int max_threads; | 58 | extern int max_threads; |
59 | extern int sysrq_enabled; | 59 | extern int sysrq_enabled; |
60 | extern int core_uses_pid; | 60 | extern int core_uses_pid; |
61 | extern int suid_dumpable; | ||
61 | extern char core_pattern[]; | 62 | extern char core_pattern[]; |
62 | extern int cad_pid; | 63 | extern int cad_pid; |
63 | extern int pid_max; | 64 | extern int pid_max; |
@@ -113,6 +114,7 @@ extern int unaligned_enabled; | |||
113 | extern int sysctl_ieee_emulation_warnings; | 114 | extern int sysctl_ieee_emulation_warnings; |
114 | #endif | 115 | #endif |
115 | extern int sysctl_userprocess_debug; | 116 | extern int sysctl_userprocess_debug; |
117 | extern int spin_retry; | ||
116 | #endif | 118 | #endif |
117 | 119 | ||
118 | extern int sysctl_hz_timer; | 120 | extern int sysctl_hz_timer; |
@@ -145,6 +147,9 @@ extern ctl_table random_table[]; | |||
145 | #ifdef CONFIG_UNIX98_PTYS | 147 | #ifdef CONFIG_UNIX98_PTYS |
146 | extern ctl_table pty_table[]; | 148 | extern ctl_table pty_table[]; |
147 | #endif | 149 | #endif |
150 | #ifdef CONFIG_INOTIFY | ||
151 | extern ctl_table inotify_table[]; | ||
152 | #endif | ||
148 | 153 | ||
149 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | 154 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT |
150 | int sysctl_legacy_va_layout; | 155 | int sysctl_legacy_va_layout; |
@@ -217,6 +222,7 @@ static ctl_table root_table[] = { | |||
217 | .mode = 0555, | 222 | .mode = 0555, |
218 | .child = dev_table, | 223 | .child = dev_table, |
219 | }, | 224 | }, |
225 | |||
220 | { .ctl_name = 0 } | 226 | { .ctl_name = 0 } |
221 | }; | 227 | }; |
222 | 228 | ||
@@ -642,7 +648,16 @@ static ctl_table kern_table[] = { | |||
642 | .mode = 0644, | 648 | .mode = 0644, |
643 | .proc_handler = &proc_dointvec, | 649 | .proc_handler = &proc_dointvec, |
644 | }, | 650 | }, |
645 | 651 | #if defined(CONFIG_ARCH_S390) | |
652 | { | ||
653 | .ctl_name = KERN_SPIN_RETRY, | ||
654 | .procname = "spin_retry", | ||
655 | .data = &spin_retry, | ||
656 | .maxlen = sizeof (int), | ||
657 | .mode = 0644, | ||
658 | .proc_handler = &proc_dointvec, | ||
659 | }, | ||
660 | #endif | ||
646 | { .ctl_name = 0 } | 661 | { .ctl_name = 0 } |
647 | }; | 662 | }; |
648 | 663 | ||
@@ -949,7 +964,23 @@ static ctl_table fs_table[] = { | |||
949 | .mode = 0644, | 964 | .mode = 0644, |
950 | .proc_handler = &proc_dointvec, | 965 | .proc_handler = &proc_dointvec, |
951 | }, | 966 | }, |
967 | #ifdef CONFIG_INOTIFY | ||
968 | { | ||
969 | .ctl_name = FS_INOTIFY, | ||
970 | .procname = "inotify", | ||
971 | .mode = 0555, | ||
972 | .child = inotify_table, | ||
973 | }, | ||
974 | #endif | ||
952 | #endif | 975 | #endif |
976 | { | ||
977 | .ctl_name = KERN_SETUID_DUMPABLE, | ||
978 | .procname = "suid_dumpable", | ||
979 | .data = &suid_dumpable, | ||
980 | .maxlen = sizeof(int), | ||
981 | .mode = 0644, | ||
982 | .proc_handler = &proc_dointvec, | ||
983 | }, | ||
953 | { .ctl_name = 0 } | 984 | { .ctl_name = 0 } |
954 | }; | 985 | }; |
955 | 986 | ||
@@ -959,7 +990,7 @@ static ctl_table debug_table[] = { | |||
959 | 990 | ||
960 | static ctl_table dev_table[] = { | 991 | static ctl_table dev_table[] = { |
961 | { .ctl_name = 0 } | 992 | { .ctl_name = 0 } |
962 | }; | 993 | }; |
963 | 994 | ||
964 | extern void init_irq_proc (void); | 995 | extern void init_irq_proc (void); |
965 | 996 | ||
@@ -991,8 +1022,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
991 | int error = parse_table(name, nlen, oldval, oldlenp, | 1022 | int error = parse_table(name, nlen, oldval, oldlenp, |
992 | newval, newlen, head->ctl_table, | 1023 | newval, newlen, head->ctl_table, |
993 | &context); | 1024 | &context); |
994 | if (context) | 1025 | kfree(context); |
995 | kfree(context); | ||
996 | if (error != -ENOTDIR) | 1026 | if (error != -ENOTDIR) |
997 | return error; | 1027 | return error; |
998 | tmp = tmp->next; | 1028 | tmp = tmp->next; |
diff --git a/kernel/time.c b/kernel/time.c index d4335c1c884c..dd5ae1162a8f 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -128,7 +128,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us | |||
128 | * as real UNIX machines always do it. This avoids all headaches about | 128 | * as real UNIX machines always do it. This avoids all headaches about |
129 | * daylight saving times and warping kernel clocks. | 129 | * daylight saving times and warping kernel clocks. |
130 | */ | 130 | */ |
131 | inline static void warp_clock(void) | 131 | static inline void warp_clock(void) |
132 | { | 132 | { |
133 | write_seqlock_irq(&xtime_lock); | 133 | write_seqlock_irq(&xtime_lock); |
134 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; | 134 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; |
diff --git a/kernel/timer.c b/kernel/timer.c index 207aa4f0aa10..5377f40723ff 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec); | |||
57 | #define TVN_MASK (TVN_SIZE - 1) | 57 | #define TVN_MASK (TVN_SIZE - 1) |
58 | #define TVR_MASK (TVR_SIZE - 1) | 58 | #define TVR_MASK (TVR_SIZE - 1) |
59 | 59 | ||
60 | struct timer_base_s { | ||
61 | spinlock_t lock; | ||
62 | struct timer_list *running_timer; | ||
63 | }; | ||
64 | |||
60 | typedef struct tvec_s { | 65 | typedef struct tvec_s { |
61 | struct list_head vec[TVN_SIZE]; | 66 | struct list_head vec[TVN_SIZE]; |
62 | } tvec_t; | 67 | } tvec_t; |
@@ -66,9 +71,8 @@ typedef struct tvec_root_s { | |||
66 | } tvec_root_t; | 71 | } tvec_root_t; |
67 | 72 | ||
68 | struct tvec_t_base_s { | 73 | struct tvec_t_base_s { |
69 | spinlock_t lock; | 74 | struct timer_base_s t_base; |
70 | unsigned long timer_jiffies; | 75 | unsigned long timer_jiffies; |
71 | struct timer_list *running_timer; | ||
72 | tvec_root_t tv1; | 76 | tvec_root_t tv1; |
73 | tvec_t tv2; | 77 | tvec_t tv2; |
74 | tvec_t tv3; | 78 | tvec_t tv3; |
@@ -77,18 +81,16 @@ struct tvec_t_base_s { | |||
77 | } ____cacheline_aligned_in_smp; | 81 | } ____cacheline_aligned_in_smp; |
78 | 82 | ||
79 | typedef struct tvec_t_base_s tvec_base_t; | 83 | typedef struct tvec_t_base_s tvec_base_t; |
84 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases); | ||
80 | 85 | ||
81 | static inline void set_running_timer(tvec_base_t *base, | 86 | static inline void set_running_timer(tvec_base_t *base, |
82 | struct timer_list *timer) | 87 | struct timer_list *timer) |
83 | { | 88 | { |
84 | #ifdef CONFIG_SMP | 89 | #ifdef CONFIG_SMP |
85 | base->running_timer = timer; | 90 | base->t_base.running_timer = timer; |
86 | #endif | 91 | #endif |
87 | } | 92 | } |
88 | 93 | ||
89 | /* Fake initialization */ | ||
90 | static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED }; | ||
91 | |||
92 | static void check_timer_failed(struct timer_list *timer) | 94 | static void check_timer_failed(struct timer_list *timer) |
93 | { | 95 | { |
94 | static int whine_count; | 96 | static int whine_count; |
@@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer) | |||
103 | /* | 105 | /* |
104 | * Now fix it up | 106 | * Now fix it up |
105 | */ | 107 | */ |
106 | spin_lock_init(&timer->lock); | ||
107 | timer->magic = TIMER_MAGIC; | 108 | timer->magic = TIMER_MAGIC; |
108 | } | 109 | } |
109 | 110 | ||
@@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | |||
156 | list_add_tail(&timer->entry, vec); | 157 | list_add_tail(&timer->entry, vec); |
157 | } | 158 | } |
158 | 159 | ||
160 | typedef struct timer_base_s timer_base_t; | ||
161 | /* | ||
162 | * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases) | ||
163 | * at compile time, and we need timer->base to lock the timer. | ||
164 | */ | ||
165 | timer_base_t __init_timer_base | ||
166 | ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED }; | ||
167 | EXPORT_SYMBOL(__init_timer_base); | ||
168 | |||
169 | /*** | ||
170 | * init_timer - initialize a timer. | ||
171 | * @timer: the timer to be initialized | ||
172 | * | ||
173 | * init_timer() must be done to a timer prior to calling *any* of the | ||
174 | * other timer functions. | ||
175 | */ | ||
176 | void fastcall init_timer(struct timer_list *timer) | ||
177 | { | ||
178 | timer->entry.next = NULL; | ||
179 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | ||
180 | timer->magic = TIMER_MAGIC; | ||
181 | } | ||
182 | EXPORT_SYMBOL(init_timer); | ||
183 | |||
184 | static inline void detach_timer(struct timer_list *timer, | ||
185 | int clear_pending) | ||
186 | { | ||
187 | struct list_head *entry = &timer->entry; | ||
188 | |||
189 | __list_del(entry->prev, entry->next); | ||
190 | if (clear_pending) | ||
191 | entry->next = NULL; | ||
192 | entry->prev = LIST_POISON2; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock | ||
197 | * means that all timers which are tied to this base via timer->base are | ||
198 | * locked, and the base itself is locked too. | ||
199 | * | ||
200 | * So __run_timers/migrate_timers can safely modify all timers which could | ||
201 | * be found on ->tvX lists. | ||
202 | * | ||
203 | * When the timer's base is locked, and the timer is removed from the list, it is | ||
204 | * possible to set timer->base = NULL and drop the lock: the timer remains | ||
205 | * locked. | ||
206 | */ | ||
207 | static timer_base_t *lock_timer_base(struct timer_list *timer, | ||
208 | unsigned long *flags) | ||
209 | { | ||
210 | timer_base_t *base; | ||
211 | |||
212 | for (;;) { | ||
213 | base = timer->base; | ||
214 | if (likely(base != NULL)) { | ||
215 | spin_lock_irqsave(&base->lock, *flags); | ||
216 | if (likely(base == timer->base)) | ||
217 | return base; | ||
218 | /* The timer has migrated to another CPU */ | ||
219 | spin_unlock_irqrestore(&base->lock, *flags); | ||
220 | } | ||
221 | cpu_relax(); | ||
222 | } | ||
223 | } | ||
224 | |||
159 | int __mod_timer(struct timer_list *timer, unsigned long expires) | 225 | int __mod_timer(struct timer_list *timer, unsigned long expires) |
160 | { | 226 | { |
161 | tvec_base_t *old_base, *new_base; | 227 | timer_base_t *base; |
228 | tvec_base_t *new_base; | ||
162 | unsigned long flags; | 229 | unsigned long flags; |
163 | int ret = 0; | 230 | int ret = 0; |
164 | 231 | ||
165 | BUG_ON(!timer->function); | 232 | BUG_ON(!timer->function); |
166 | |||
167 | check_timer(timer); | 233 | check_timer(timer); |
168 | 234 | ||
169 | spin_lock_irqsave(&timer->lock, flags); | 235 | base = lock_timer_base(timer, &flags); |
236 | |||
237 | if (timer_pending(timer)) { | ||
238 | detach_timer(timer, 0); | ||
239 | ret = 1; | ||
240 | } | ||
241 | |||
170 | new_base = &__get_cpu_var(tvec_bases); | 242 | new_base = &__get_cpu_var(tvec_bases); |
171 | repeat: | ||
172 | old_base = timer->base; | ||
173 | 243 | ||
174 | /* | 244 | if (base != &new_base->t_base) { |
175 | * Prevent deadlocks via ordering by old_base < new_base. | ||
176 | */ | ||
177 | if (old_base && (new_base != old_base)) { | ||
178 | if (old_base < new_base) { | ||
179 | spin_lock(&new_base->lock); | ||
180 | spin_lock(&old_base->lock); | ||
181 | } else { | ||
182 | spin_lock(&old_base->lock); | ||
183 | spin_lock(&new_base->lock); | ||
184 | } | ||
185 | /* | 245 | /* |
186 | * The timer base might have been cancelled while we were | 246 | * We are trying to schedule the timer on the local CPU. |
187 | * trying to take the lock(s): | 247 | * However we can't change timer's base while it is running, |
248 | * otherwise del_timer_sync() can't detect that the timer's | ||
249 | * handler has not yet finished. This also guarantees that | ||
250 | * the timer is serialized wrt itself. | ||
188 | */ | 251 | */ |
189 | if (timer->base != old_base) { | 252 | if (unlikely(base->running_timer == timer)) { |
190 | spin_unlock(&new_base->lock); | 253 | /* The timer remains on a former base */ |
191 | spin_unlock(&old_base->lock); | 254 | new_base = container_of(base, tvec_base_t, t_base); |
192 | goto repeat; | 255 | } else { |
193 | } | 256 | /* See the comment in lock_timer_base() */ |
194 | } else { | 257 | timer->base = NULL; |
195 | spin_lock(&new_base->lock); | 258 | spin_unlock(&base->lock); |
196 | if (timer->base != old_base) { | 259 | spin_lock(&new_base->t_base.lock); |
197 | spin_unlock(&new_base->lock); | 260 | timer->base = &new_base->t_base; |
198 | goto repeat; | ||
199 | } | 261 | } |
200 | } | 262 | } |
201 | 263 | ||
202 | /* | ||
203 | * Delete the previous timeout (if there was any), and install | ||
204 | * the new one: | ||
205 | */ | ||
206 | if (old_base) { | ||
207 | list_del(&timer->entry); | ||
208 | ret = 1; | ||
209 | } | ||
210 | timer->expires = expires; | 264 | timer->expires = expires; |
211 | internal_add_timer(new_base, timer); | 265 | internal_add_timer(new_base, timer); |
212 | timer->base = new_base; | 266 | spin_unlock_irqrestore(&new_base->t_base.lock, flags); |
213 | |||
214 | if (old_base && (new_base != old_base)) | ||
215 | spin_unlock(&old_base->lock); | ||
216 | spin_unlock(&new_base->lock); | ||
217 | spin_unlock_irqrestore(&timer->lock, flags); | ||
218 | 267 | ||
219 | return ret; | 268 | return ret; |
220 | } | 269 | } |
@@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
232 | { | 281 | { |
233 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); | 282 | tvec_base_t *base = &per_cpu(tvec_bases, cpu); |
234 | unsigned long flags; | 283 | unsigned long flags; |
235 | 284 | ||
236 | BUG_ON(timer_pending(timer) || !timer->function); | 285 | BUG_ON(timer_pending(timer) || !timer->function); |
237 | 286 | ||
238 | check_timer(timer); | 287 | check_timer(timer); |
239 | 288 | ||
240 | spin_lock_irqsave(&base->lock, flags); | 289 | spin_lock_irqsave(&base->t_base.lock, flags); |
290 | timer->base = &base->t_base; | ||
241 | internal_add_timer(base, timer); | 291 | internal_add_timer(base, timer); |
242 | timer->base = base; | 292 | spin_unlock_irqrestore(&base->t_base.lock, flags); |
243 | spin_unlock_irqrestore(&base->lock, flags); | ||
244 | } | 293 | } |
245 | 294 | ||
246 | 295 | ||
@@ -295,109 +344,84 @@ EXPORT_SYMBOL(mod_timer); | |||
295 | */ | 344 | */ |
296 | int del_timer(struct timer_list *timer) | 345 | int del_timer(struct timer_list *timer) |
297 | { | 346 | { |
347 | timer_base_t *base; | ||
298 | unsigned long flags; | 348 | unsigned long flags; |
299 | tvec_base_t *base; | 349 | int ret = 0; |
300 | 350 | ||
301 | check_timer(timer); | 351 | check_timer(timer); |
302 | 352 | ||
303 | repeat: | 353 | if (timer_pending(timer)) { |
304 | base = timer->base; | 354 | base = lock_timer_base(timer, &flags); |
305 | if (!base) | 355 | if (timer_pending(timer)) { |
306 | return 0; | 356 | detach_timer(timer, 1); |
307 | spin_lock_irqsave(&base->lock, flags); | 357 | ret = 1; |
308 | if (base != timer->base) { | 358 | } |
309 | spin_unlock_irqrestore(&base->lock, flags); | 359 | spin_unlock_irqrestore(&base->lock, flags); |
310 | goto repeat; | ||
311 | } | 360 | } |
312 | list_del(&timer->entry); | ||
313 | /* Need to make sure that anybody who sees a NULL base also sees the list ops */ | ||
314 | smp_wmb(); | ||
315 | timer->base = NULL; | ||
316 | spin_unlock_irqrestore(&base->lock, flags); | ||
317 | 361 | ||
318 | return 1; | 362 | return ret; |
319 | } | 363 | } |
320 | 364 | ||
321 | EXPORT_SYMBOL(del_timer); | 365 | EXPORT_SYMBOL(del_timer); |
322 | 366 | ||
323 | #ifdef CONFIG_SMP | 367 | #ifdef CONFIG_SMP |
324 | /*** | 368 | /* |
325 | * del_timer_sync - deactivate a timer and wait for the handler to finish. | 369 | * This function tries to deactivate a timer. Upon successful (ret >= 0) |
326 | * @timer: the timer to be deactivated | 370 | * exit the timer is not queued and the handler is not running on any CPU. |
327 | * | ||
328 | * This function only differs from del_timer() on SMP: besides deactivating | ||
329 | * the timer it also makes sure the handler has finished executing on other | ||
330 | * CPUs. | ||
331 | * | ||
332 | * Synchronization rules: callers must prevent restarting of the timer, | ||
333 | * otherwise this function is meaningless. It must not be called from | ||
334 | * interrupt contexts. The caller must not hold locks which would prevent | ||
335 | * completion of the timer's handler. Upon exit the timer is not queued and | ||
336 | * the handler is not running on any CPU. | ||
337 | * | ||
338 | * The function returns whether it has deactivated a pending timer or not. | ||
339 | * | 371 | * |
340 | * del_timer_sync() is slow and complicated because it copes with timer | 372 | * It must not be called from interrupt contexts. |
341 | * handlers which re-arm the timer (periodic timers). If the timer handler | ||
342 | * is known to not do this (a single shot timer) then use | ||
343 | * del_singleshot_timer_sync() instead. | ||
344 | */ | 373 | */ |
345 | int del_timer_sync(struct timer_list *timer) | 374 | int try_to_del_timer_sync(struct timer_list *timer) |
346 | { | 375 | { |
347 | tvec_base_t *base; | 376 | timer_base_t *base; |
348 | int i, ret = 0; | 377 | unsigned long flags; |
378 | int ret = -1; | ||
349 | 379 | ||
350 | check_timer(timer); | 380 | base = lock_timer_base(timer, &flags); |
351 | 381 | ||
352 | del_again: | 382 | if (base->running_timer == timer) |
353 | ret += del_timer(timer); | 383 | goto out; |
354 | 384 | ||
355 | for_each_online_cpu(i) { | 385 | ret = 0; |
356 | base = &per_cpu(tvec_bases, i); | 386 | if (timer_pending(timer)) { |
357 | if (base->running_timer == timer) { | 387 | detach_timer(timer, 1); |
358 | while (base->running_timer == timer) { | 388 | ret = 1; |
359 | cpu_relax(); | ||
360 | preempt_check_resched(); | ||
361 | } | ||
362 | break; | ||
363 | } | ||
364 | } | 389 | } |
365 | smp_rmb(); | 390 | out: |
366 | if (timer_pending(timer)) | 391 | spin_unlock_irqrestore(&base->lock, flags); |
367 | goto del_again; | ||
368 | 392 | ||
369 | return ret; | 393 | return ret; |
370 | } | 394 | } |
371 | EXPORT_SYMBOL(del_timer_sync); | ||
372 | 395 | ||
373 | /*** | 396 | /*** |
374 | * del_singleshot_timer_sync - deactivate a non-recursive timer | 397 | * del_timer_sync - deactivate a timer and wait for the handler to finish. |
375 | * @timer: the timer to be deactivated | 398 | * @timer: the timer to be deactivated |
376 | * | 399 | * |
377 | * This function is an optimization of del_timer_sync for the case where the | 400 | * This function only differs from del_timer() on SMP: besides deactivating |
378 | * caller can guarantee the timer does not reschedule itself in its timer | 401 | * the timer it also makes sure the handler has finished executing on other |
379 | * function. | 402 | * CPUs. |
380 | * | 403 | * |
381 | * Synchronization rules: callers must prevent restarting of the timer, | 404 | * Synchronization rules: callers must prevent restarting of the timer, |
382 | * otherwise this function is meaningless. It must not be called from | 405 | * otherwise this function is meaningless. It must not be called from |
383 | * interrupt contexts. The caller must not hold locks which wold prevent | 406 | * interrupt contexts. The caller must not hold locks which would prevent |
384 | * completion of the timer's handler. Upon exit the timer is not queued and | 407 | * completion of the timer's handler. The timer's handler must not call |
385 | * the handler is not running on any CPU. | 408 | * add_timer_on(). Upon exit the timer is not queued and the handler is |
409 | * not running on any CPU. | ||
386 | * | 410 | * |
387 | * The function returns whether it has deactivated a pending timer or not. | 411 | * The function returns whether it has deactivated a pending timer or not. |
388 | */ | 412 | */ |
389 | int del_singleshot_timer_sync(struct timer_list *timer) | 413 | int del_timer_sync(struct timer_list *timer) |
390 | { | 414 | { |
391 | int ret = del_timer(timer); | 415 | check_timer(timer); |
392 | 416 | ||
393 | if (!ret) { | 417 | for (;;) { |
394 | ret = del_timer_sync(timer); | 418 | int ret = try_to_del_timer_sync(timer); |
395 | BUG_ON(ret); | 419 | if (ret >= 0) |
420 | return ret; | ||
396 | } | 421 | } |
397 | |||
398 | return ret; | ||
399 | } | 422 | } |
400 | EXPORT_SYMBOL(del_singleshot_timer_sync); | 423 | |
424 | EXPORT_SYMBOL(del_timer_sync); | ||
401 | #endif | 425 | #endif |
402 | 426 | ||
403 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 427 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) |
@@ -415,7 +439,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) | |||
415 | struct timer_list *tmp; | 439 | struct timer_list *tmp; |
416 | 440 | ||
417 | tmp = list_entry(curr, struct timer_list, entry); | 441 | tmp = list_entry(curr, struct timer_list, entry); |
418 | BUG_ON(tmp->base != base); | 442 | BUG_ON(tmp->base != &base->t_base); |
419 | curr = curr->next; | 443 | curr = curr->next; |
420 | internal_add_timer(base, tmp); | 444 | internal_add_timer(base, tmp); |
421 | } | 445 | } |
@@ -437,7 +461,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
437 | { | 461 | { |
438 | struct timer_list *timer; | 462 | struct timer_list *timer; |
439 | 463 | ||
440 | spin_lock_irq(&base->lock); | 464 | spin_lock_irq(&base->t_base.lock); |
441 | while (time_after_eq(jiffies, base->timer_jiffies)) { | 465 | while (time_after_eq(jiffies, base->timer_jiffies)) { |
442 | struct list_head work_list = LIST_HEAD_INIT(work_list); | 466 | struct list_head work_list = LIST_HEAD_INIT(work_list); |
443 | struct list_head *head = &work_list; | 467 | struct list_head *head = &work_list; |
@@ -453,8 +477,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
453 | cascade(base, &base->tv5, INDEX(3)); | 477 | cascade(base, &base->tv5, INDEX(3)); |
454 | ++base->timer_jiffies; | 478 | ++base->timer_jiffies; |
455 | list_splice_init(base->tv1.vec + index, &work_list); | 479 | list_splice_init(base->tv1.vec + index, &work_list); |
456 | repeat: | 480 | while (!list_empty(head)) { |
457 | if (!list_empty(head)) { | ||
458 | void (*fn)(unsigned long); | 481 | void (*fn)(unsigned long); |
459 | unsigned long data; | 482 | unsigned long data; |
460 | 483 | ||
@@ -462,25 +485,26 @@ repeat: | |||
462 | fn = timer->function; | 485 | fn = timer->function; |
463 | data = timer->data; | 486 | data = timer->data; |
464 | 487 | ||
465 | list_del(&timer->entry); | ||
466 | set_running_timer(base, timer); | 488 | set_running_timer(base, timer); |
467 | smp_wmb(); | 489 | detach_timer(timer, 1); |
468 | timer->base = NULL; | 490 | spin_unlock_irq(&base->t_base.lock); |
469 | spin_unlock_irq(&base->lock); | ||
470 | { | 491 | { |
471 | u32 preempt_count = preempt_count(); | 492 | int preempt_count = preempt_count(); |
472 | fn(data); | 493 | fn(data); |
473 | if (preempt_count != preempt_count()) { | 494 | if (preempt_count != preempt_count()) { |
474 | printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count()); | 495 | printk(KERN_WARNING "huh, entered %p " |
496 | "with preempt_count %08x, exited" | ||
497 | " with %08x?\n", | ||
498 | fn, preempt_count, | ||
499 | preempt_count()); | ||
475 | BUG(); | 500 | BUG(); |
476 | } | 501 | } |
477 | } | 502 | } |
478 | spin_lock_irq(&base->lock); | 503 | spin_lock_irq(&base->t_base.lock); |
479 | goto repeat; | ||
480 | } | 504 | } |
481 | } | 505 | } |
482 | set_running_timer(base, NULL); | 506 | set_running_timer(base, NULL); |
483 | spin_unlock_irq(&base->lock); | 507 | spin_unlock_irq(&base->t_base.lock); |
484 | } | 508 | } |
485 | 509 | ||
486 | #ifdef CONFIG_NO_IDLE_HZ | 510 | #ifdef CONFIG_NO_IDLE_HZ |
@@ -499,7 +523,7 @@ unsigned long next_timer_interrupt(void) | |||
499 | int i, j; | 523 | int i, j; |
500 | 524 | ||
501 | base = &__get_cpu_var(tvec_bases); | 525 | base = &__get_cpu_var(tvec_bases); |
502 | spin_lock(&base->lock); | 526 | spin_lock(&base->t_base.lock); |
503 | expires = base->timer_jiffies + (LONG_MAX >> 1); | 527 | expires = base->timer_jiffies + (LONG_MAX >> 1); |
504 | list = 0; | 528 | list = 0; |
505 | 529 | ||
@@ -547,7 +571,7 @@ found: | |||
547 | expires = nte->expires; | 571 | expires = nte->expires; |
548 | } | 572 | } |
549 | } | 573 | } |
550 | spin_unlock(&base->lock); | 574 | spin_unlock(&base->t_base.lock); |
551 | return expires; | 575 | return expires; |
552 | } | 576 | } |
553 | #endif | 577 | #endif |
@@ -999,7 +1023,7 @@ asmlinkage long sys_getppid(void) | |||
999 | parent = me->group_leader->real_parent; | 1023 | parent = me->group_leader->real_parent; |
1000 | for (;;) { | 1024 | for (;;) { |
1001 | pid = parent->tgid; | 1025 | pid = parent->tgid; |
1002 | #ifdef CONFIG_SMP | 1026 | #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) |
1003 | { | 1027 | { |
1004 | struct task_struct *old = parent; | 1028 | struct task_struct *old = parent; |
1005 | 1029 | ||
@@ -1286,9 +1310,9 @@ static void __devinit init_timers_cpu(int cpu) | |||
1286 | { | 1310 | { |
1287 | int j; | 1311 | int j; |
1288 | tvec_base_t *base; | 1312 | tvec_base_t *base; |
1289 | 1313 | ||
1290 | base = &per_cpu(tvec_bases, cpu); | 1314 | base = &per_cpu(tvec_bases, cpu); |
1291 | spin_lock_init(&base->lock); | 1315 | spin_lock_init(&base->t_base.lock); |
1292 | for (j = 0; j < TVN_SIZE; j++) { | 1316 | for (j = 0; j < TVN_SIZE; j++) { |
1293 | INIT_LIST_HEAD(base->tv5.vec + j); | 1317 | INIT_LIST_HEAD(base->tv5.vec + j); |
1294 | INIT_LIST_HEAD(base->tv4.vec + j); | 1318 | INIT_LIST_HEAD(base->tv4.vec + j); |
@@ -1302,22 +1326,16 @@ static void __devinit init_timers_cpu(int cpu) | |||
1302 | } | 1326 | } |
1303 | 1327 | ||
1304 | #ifdef CONFIG_HOTPLUG_CPU | 1328 | #ifdef CONFIG_HOTPLUG_CPU |
1305 | static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | 1329 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) |
1306 | { | 1330 | { |
1307 | struct timer_list *timer; | 1331 | struct timer_list *timer; |
1308 | 1332 | ||
1309 | while (!list_empty(head)) { | 1333 | while (!list_empty(head)) { |
1310 | timer = list_entry(head->next, struct timer_list, entry); | 1334 | timer = list_entry(head->next, struct timer_list, entry); |
1311 | /* We're locking backwards from __mod_timer order here, | 1335 | detach_timer(timer, 0); |
1312 | beware deadlock. */ | 1336 | timer->base = &new_base->t_base; |
1313 | if (!spin_trylock(&timer->lock)) | ||
1314 | return 0; | ||
1315 | list_del(&timer->entry); | ||
1316 | internal_add_timer(new_base, timer); | 1337 | internal_add_timer(new_base, timer); |
1317 | timer->base = new_base; | ||
1318 | spin_unlock(&timer->lock); | ||
1319 | } | 1338 | } |
1320 | return 1; | ||
1321 | } | 1339 | } |
1322 | 1340 | ||
1323 | static void __devinit migrate_timers(int cpu) | 1341 | static void __devinit migrate_timers(int cpu) |
@@ -1331,39 +1349,24 @@ static void __devinit migrate_timers(int cpu) | |||
1331 | new_base = &get_cpu_var(tvec_bases); | 1349 | new_base = &get_cpu_var(tvec_bases); |
1332 | 1350 | ||
1333 | local_irq_disable(); | 1351 | local_irq_disable(); |
1334 | again: | 1352 | spin_lock(&new_base->t_base.lock); |
1335 | /* Prevent deadlocks via ordering by old_base < new_base. */ | 1353 | spin_lock(&old_base->t_base.lock); |
1336 | if (old_base < new_base) { | ||
1337 | spin_lock(&new_base->lock); | ||
1338 | spin_lock(&old_base->lock); | ||
1339 | } else { | ||
1340 | spin_lock(&old_base->lock); | ||
1341 | spin_lock(&new_base->lock); | ||
1342 | } | ||
1343 | 1354 | ||
1344 | if (old_base->running_timer) | 1355 | if (old_base->t_base.running_timer) |
1345 | BUG(); | 1356 | BUG(); |
1346 | for (i = 0; i < TVR_SIZE; i++) | 1357 | for (i = 0; i < TVR_SIZE; i++) |
1347 | if (!migrate_timer_list(new_base, old_base->tv1.vec + i)) | 1358 | migrate_timer_list(new_base, old_base->tv1.vec + i); |
1348 | goto unlock_again; | 1359 | for (i = 0; i < TVN_SIZE; i++) { |
1349 | for (i = 0; i < TVN_SIZE; i++) | 1360 | migrate_timer_list(new_base, old_base->tv2.vec + i); |
1350 | if (!migrate_timer_list(new_base, old_base->tv2.vec + i) | 1361 | migrate_timer_list(new_base, old_base->tv3.vec + i); |
1351 | || !migrate_timer_list(new_base, old_base->tv3.vec + i) | 1362 | migrate_timer_list(new_base, old_base->tv4.vec + i); |
1352 | || !migrate_timer_list(new_base, old_base->tv4.vec + i) | 1363 | migrate_timer_list(new_base, old_base->tv5.vec + i); |
1353 | || !migrate_timer_list(new_base, old_base->tv5.vec + i)) | 1364 | } |
1354 | goto unlock_again; | 1365 | |
1355 | spin_unlock(&old_base->lock); | 1366 | spin_unlock(&old_base->t_base.lock); |
1356 | spin_unlock(&new_base->lock); | 1367 | spin_unlock(&new_base->t_base.lock); |
1357 | local_irq_enable(); | 1368 | local_irq_enable(); |
1358 | put_cpu_var(tvec_bases); | 1369 | put_cpu_var(tvec_bases); |
1359 | return; | ||
1360 | |||
1361 | unlock_again: | ||
1362 | /* Avoid deadlock with __mod_timer, by backing off. */ | ||
1363 | spin_unlock(&old_base->lock); | ||
1364 | spin_unlock(&new_base->lock); | ||
1365 | cpu_relax(); | ||
1366 | goto again; | ||
1367 | } | 1370 | } |
1368 | #endif /* CONFIG_HOTPLUG_CPU */ | 1371 | #endif /* CONFIG_HOTPLUG_CPU */ |
1369 | 1372 | ||
@@ -1594,7 +1597,7 @@ void msleep(unsigned int msecs) | |||
1594 | EXPORT_SYMBOL(msleep); | 1597 | EXPORT_SYMBOL(msleep); |
1595 | 1598 | ||
1596 | /** | 1599 | /** |
1597 | * msleep_interruptible - sleep waiting for waitqueue interruptions | 1600 | * msleep_interruptible - sleep waiting for signals |
1598 | * @msecs: Time in milliseconds to sleep for | 1601 | * @msecs: Time in milliseconds to sleep for |
1599 | */ | 1602 | */ |
1600 | unsigned long msleep_interruptible(unsigned int msecs) | 1603 | unsigned long msleep_interruptible(unsigned int msecs) |
diff --git a/kernel/user.c b/kernel/user.c index 734575d55769..89e562feb1b1 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -120,6 +120,10 @@ struct user_struct * alloc_uid(uid_t uid) | |||
120 | atomic_set(&new->processes, 0); | 120 | atomic_set(&new->processes, 0); |
121 | atomic_set(&new->files, 0); | 121 | atomic_set(&new->files, 0); |
122 | atomic_set(&new->sigpending, 0); | 122 | atomic_set(&new->sigpending, 0); |
123 | #ifdef CONFIG_INOTIFY | ||
124 | atomic_set(&new->inotify_watches, 0); | ||
125 | atomic_set(&new->inotify_devs, 0); | ||
126 | #endif | ||
123 | 127 | ||
124 | new->mq_bytes = 0; | 128 | new->mq_bytes = 0; |
125 | new->locked_shm = 0; | 129 | new->locked_shm = 0; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 259cf55da3c9..c7e36d4a70ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -308,8 +308,6 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
308 | struct workqueue_struct *wq; | 308 | struct workqueue_struct *wq; |
309 | struct task_struct *p; | 309 | struct task_struct *p; |
310 | 310 | ||
311 | BUG_ON(strlen(name) > 10); | ||
312 | |||
313 | wq = kmalloc(sizeof(*wq), GFP_KERNEL); | 311 | wq = kmalloc(sizeof(*wq), GFP_KERNEL); |
314 | if (!wq) | 312 | if (!wq) |
315 | return NULL; | 313 | return NULL; |