diff options
author | Paul Mackerras <paulus@samba.org> | 2005-10-30 21:37:12 -0500 |
---|---|---|
committer | Paul Mackerras <paulus@samba.org> | 2005-10-30 21:37:12 -0500 |
commit | 23fd07750a789a66fe88cf173d52a18f1a387da4 (patch) | |
tree | 06fdd6df35fdb835abdaa9b754d62f6b84b97250 /kernel | |
parent | bd787d438a59266af3c9f6351644c85ef1dd21fe (diff) | |
parent | ed28f96ac1960f30f818374d65be71d2fdf811b0 (diff) |
Merge ../linux-2.6 by hand
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/acct.c | 2 | ||||
-rw-r--r-- | kernel/audit.c | 6 | ||||
-rw-r--r-- | kernel/auditsc.c | 2 | ||||
-rw-r--r-- | kernel/cpu.c | 1 | ||||
-rw-r--r-- | kernel/cpuset.c | 466 | ||||
-rw-r--r-- | kernel/exit.c | 34 | ||||
-rw-r--r-- | kernel/fork.c | 31 | ||||
-rw-r--r-- | kernel/futex.c | 6 | ||||
-rw-r--r-- | kernel/kallsyms.c | 1 | ||||
-rw-r--r-- | kernel/kexec.c | 11 | ||||
-rw-r--r-- | kernel/kmod.c | 6 | ||||
-rw-r--r-- | kernel/kprobes.c | 1 | ||||
-rw-r--r-- | kernel/kthread.c | 13 | ||||
-rw-r--r-- | kernel/params.c | 1 | ||||
-rw-r--r-- | kernel/posix-cpu-timers.c | 16 | ||||
-rw-r--r-- | kernel/posix-timers.c | 19 | ||||
-rw-r--r-- | kernel/power/Makefile | 2 | ||||
-rw-r--r-- | kernel/power/disk.c | 22 | ||||
-rw-r--r-- | kernel/power/main.c | 5 | ||||
-rw-r--r-- | kernel/power/power.h | 17 | ||||
-rw-r--r-- | kernel/power/snapshot.c | 435 | ||||
-rw-r--r-- | kernel/power/swsusp.c | 569 | ||||
-rw-r--r-- | kernel/printk.c | 78 | ||||
-rw-r--r-- | kernel/ptrace.c | 7 | ||||
-rw-r--r-- | kernel/rcupdate.c | 10 | ||||
-rw-r--r-- | kernel/rcutorture.c | 492 | ||||
-rw-r--r-- | kernel/sched.c | 3 | ||||
-rw-r--r-- | kernel/signal.c | 150 | ||||
-rw-r--r-- | kernel/time.c | 26 | ||||
-rw-r--r-- | kernel/timer.c | 337 | ||||
-rw-r--r-- | kernel/workqueue.c | 33 |
32 files changed, 1661 insertions, 1143 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ff4dc02ce170..4f5a1453093a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o | |||
22 | obj-$(CONFIG_COMPAT) += compat.o | 22 | obj-$(CONFIG_COMPAT) += compat.o |
23 | obj-$(CONFIG_CPUSETS) += cpuset.o | 23 | obj-$(CONFIG_CPUSETS) += cpuset.o |
24 | obj-$(CONFIG_IKCONFIG) += configs.o | 24 | obj-$(CONFIG_IKCONFIG) += configs.o |
25 | obj-$(CONFIG_IKCONFIG_PROC) += configs.o | ||
26 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 25 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
27 | obj-$(CONFIG_AUDIT) += audit.o | 26 | obj-$(CONFIG_AUDIT) += audit.o |
28 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 27 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
@@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |||
32 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 31 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
33 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 32 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
34 | obj-$(CONFIG_SECCOMP) += seccomp.o | 33 | obj-$(CONFIG_SECCOMP) += seccomp.o |
34 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | ||
35 | 35 | ||
36 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 36 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
37 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 37 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/acct.c b/kernel/acct.c index b756f527497e..2e3f4a47e7d0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
553 | if (delta == 0) | 553 | if (delta == 0) |
554 | return; | 554 | return; |
555 | tsk->acct_stimexpd = tsk->stime; | 555 | tsk->acct_stimexpd = tsk->stime; |
556 | tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); | 556 | tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); |
557 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; | 557 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; |
558 | } | 558 | } |
559 | } | 559 | } |
diff --git a/kernel/audit.c b/kernel/audit.c index aefa73a8a586..0c56320d38dc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -133,7 +133,7 @@ struct audit_buffer { | |||
133 | struct list_head list; | 133 | struct list_head list; |
134 | struct sk_buff *skb; /* formatted skb ready to send */ | 134 | struct sk_buff *skb; /* formatted skb ready to send */ |
135 | struct audit_context *ctx; /* NULL or associated context */ | 135 | struct audit_context *ctx; /* NULL or associated context */ |
136 | int gfp_mask; | 136 | gfp_t gfp_mask; |
137 | }; | 137 | }; |
138 | 138 | ||
139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
@@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, | |||
647 | * will be written at syscall exit. If there is no associated task, tsk | 647 | * will be written at syscall exit. If there is no associated task, tsk |
648 | * should be NULL. */ | 648 | * should be NULL. */ |
649 | 649 | ||
650 | struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, | 650 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, |
651 | int type) | 651 | int type) |
652 | { | 652 | { |
653 | struct audit_buffer *ab = NULL; | 653 | struct audit_buffer *ab = NULL; |
@@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
879 | /* Log an audit record. This is a convenience function that calls | 879 | /* Log an audit record. This is a convenience function that calls |
880 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | 880 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be |
881 | * called in any context. */ | 881 | * called in any context. */ |
882 | void audit_log(struct audit_context *ctx, int gfp_mask, int type, | 882 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, |
883 | const char *fmt, ...) | 883 | const char *fmt, ...) |
884 | { | 884 | { |
885 | struct audit_buffer *ab; | 885 | struct audit_buffer *ab; |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 88696f639aab..d8a68509e729 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab) | |||
803 | up_read(&mm->mmap_sem); | 803 | up_read(&mm->mmap_sem); |
804 | } | 804 | } |
805 | 805 | ||
806 | static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) | 806 | static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) |
807 | { | 807 | { |
808 | int i; | 808 | int i; |
809 | struct audit_buffer *ab; | 809 | struct audit_buffer *ab; |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 53d8263ae12e..3619e939182e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -17,6 +17,7 @@ | |||
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
19 | DECLARE_MUTEX(cpucontrol); | 19 | DECLARE_MUTEX(cpucontrol); |
20 | EXPORT_SYMBOL_GPL(cpucontrol); | ||
20 | 21 | ||
21 | static struct notifier_block *cpu_chain; | 22 | static struct notifier_block *cpu_chain; |
22 | 23 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 28176d083f7b..5a737ed9dac7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/kernel.h> | 32 | #include <linux/kernel.h> |
33 | #include <linux/kmod.h> | 33 | #include <linux/kmod.h> |
34 | #include <linux/list.h> | 34 | #include <linux/list.h> |
35 | #include <linux/mempolicy.h> | ||
35 | #include <linux/mm.h> | 36 | #include <linux/mm.h> |
36 | #include <linux/module.h> | 37 | #include <linux/module.h> |
37 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
@@ -60,6 +61,9 @@ struct cpuset { | |||
60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 61 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 62 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
62 | 63 | ||
64 | /* | ||
65 | * Count is atomic so can incr (fork) or decr (exit) without a lock. | ||
66 | */ | ||
63 | atomic_t count; /* count tasks using this cpuset */ | 67 | atomic_t count; /* count tasks using this cpuset */ |
64 | 68 | ||
65 | /* | 69 | /* |
@@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount; | |||
142 | static struct super_block *cpuset_sb = NULL; | 146 | static struct super_block *cpuset_sb = NULL; |
143 | 147 | ||
144 | /* | 148 | /* |
145 | * cpuset_sem should be held by anyone who is depending on the children | 149 | * We have two global cpuset semaphores below. They can nest. |
146 | * or sibling lists of any cpuset, or performing non-atomic operations | 150 | * It is ok to first take manage_sem, then nest callback_sem. We also |
147 | * on the flags or *_allowed values of a cpuset, such as raising the | 151 | * require taking task_lock() when dereferencing a tasks cpuset pointer. |
148 | * CS_REMOVED flag bit iff it is not already raised, or reading and | 152 | * See "The task_lock() exception", at the end of this comment. |
149 | * conditionally modifying the *_allowed values. One kernel global | 153 | * |
150 | * cpuset semaphore should be sufficient - these things don't change | 154 | * A task must hold both semaphores to modify cpusets. If a task |
151 | * that much. | 155 | * holds manage_sem, then it blocks others wanting that semaphore, |
152 | * | 156 | * ensuring that it is the only task able to also acquire callback_sem |
153 | * The code that modifies cpusets holds cpuset_sem across the entire | 157 | * and be able to modify cpusets. It can perform various checks on |
154 | * operation, from cpuset_common_file_write() down, single threading | 158 | * the cpuset structure first, knowing nothing will change. It can |
155 | * all cpuset modifications (except for counter manipulations from | 159 | * also allocate memory while just holding manage_sem. While it is |
156 | * fork and exit) across the system. This presumes that cpuset | 160 | * performing these checks, various callback routines can briefly |
157 | * modifications are rare - better kept simple and safe, even if slow. | 161 | * acquire callback_sem to query cpusets. Once it is ready to make |
158 | * | 162 | * the changes, it takes callback_sem, blocking everyone else. |
159 | * The code that reads cpusets, such as in cpuset_common_file_read() | 163 | * |
160 | * and below, only holds cpuset_sem across small pieces of code, such | 164 | * Calls to the kernel memory allocator can not be made while holding |
161 | * as when reading out possibly multi-word cpumasks and nodemasks, as | 165 | * callback_sem, as that would risk double tripping on callback_sem |
162 | * the risks are less, and the desire for performance a little greater. | 166 | * from one of the callbacks into the cpuset code from within |
163 | * The proc_cpuset_show() routine needs to hold cpuset_sem to insure | 167 | * __alloc_pages(). |
164 | * that no cs->dentry is NULL, as it walks up the cpuset tree to root. | 168 | * |
165 | * | 169 | * If a task is only holding callback_sem, then it has read-only |
166 | * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't | 170 | * access to cpusets. |
167 | * (usually) grab cpuset_sem. These are the two most performance | 171 | * |
168 | * critical pieces of code here. The exception occurs on exit(), | 172 | * The task_struct fields mems_allowed and mems_generation may only |
169 | * when a task in a notify_on_release cpuset exits. Then cpuset_sem | 173 | * be accessed in the context of that task, so require no locks. |
174 | * | ||
175 | * Any task can increment and decrement the count field without lock. | ||
176 | * So in general, code holding manage_sem or callback_sem can't rely | ||
177 | * on the count field not changing. However, if the count goes to | ||
178 | * zero, then only attach_task(), which holds both semaphores, can | ||
179 | * increment it again. Because a count of zero means that no tasks | ||
180 | * are currently attached, therefore there is no way a task attached | ||
181 | * to that cpuset can fork (the other way to increment the count). | ||
182 | * So code holding manage_sem or callback_sem can safely assume that | ||
183 | * if the count is zero, it will stay zero. Similarly, if a task | ||
184 | * holds manage_sem or callback_sem on a cpuset with zero count, it | ||
185 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
186 | * both of those semaphores. | ||
187 | * | ||
188 | * A possible optimization to improve parallelism would be to make | ||
189 | * callback_sem a R/W semaphore (rwsem), allowing the callback routines | ||
190 | * to proceed in parallel, with read access, until the holder of | ||
191 | * manage_sem needed to take this rwsem for exclusive write access | ||
192 | * and modify some cpusets. | ||
193 | * | ||
194 | * The cpuset_common_file_write handler for operations that modify | ||
195 | * the cpuset hierarchy holds manage_sem across the entire operation, | ||
196 | * single threading all such cpuset modifications across the system. | ||
197 | * | ||
198 | * The cpuset_common_file_read() handlers only hold callback_sem across | ||
199 | * small pieces of code, such as when reading out possibly multi-word | ||
200 | * cpumasks and nodemasks. | ||
201 | * | ||
202 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | ||
203 | * (usually) take either semaphore. These are the two most performance | ||
204 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
205 | * when a task in a notify_on_release cpuset exits. Then manage_sem | ||
170 | * is taken, and if the cpuset count is zero, a usermode call made | 206 | * is taken, and if the cpuset count is zero, a usermode call made |
171 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 207 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
172 | * relative to the root of cpuset file system) as the argument. | 208 | * relative to the root of cpuset file system) as the argument. |
173 | * | 209 | * |
174 | * A cpuset can only be deleted if both its 'count' of using tasks is | 210 | * A cpuset can only be deleted if both its 'count' of using tasks |
175 | * zero, and its list of 'children' cpusets is empty. Since all tasks | 211 | * is zero, and its list of 'children' cpusets is empty. Since all |
176 | * in the system use _some_ cpuset, and since there is always at least | 212 | * tasks in the system use _some_ cpuset, and since there is always at |
177 | * one task in the system (init, pid == 1), therefore, top_cpuset | 213 | * least one task in the system (init, pid == 1), therefore, top_cpuset |
178 | * always has either children cpusets and/or using tasks. So no need | 214 | * always has either children cpusets and/or using tasks. So we don't |
179 | * for any special hack to ensure that top_cpuset cannot be deleted. | 215 | * need a special hack to ensure that top_cpuset cannot be deleted. |
216 | * | ||
217 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
218 | * | ||
219 | * The task_lock() exception | ||
220 | * | ||
221 | * The need for this exception arises from the action of attach_task(), | ||
222 | * which overwrites one tasks cpuset pointer with another. It does | ||
223 | * so using both semaphores, however there are several performance | ||
224 | * critical places that need to reference task->cpuset without the | ||
225 | * expense of grabbing a system global semaphore. Therefore except as | ||
226 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
227 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | ||
228 | * (task->alloc_lock) already in the task_struct routinely used for | ||
229 | * such matters. | ||
180 | */ | 230 | */ |
181 | 231 | ||
182 | static DECLARE_MUTEX(cpuset_sem); | 232 | static DECLARE_MUTEX(manage_sem); |
183 | static struct task_struct *cpuset_sem_owner; | 233 | static DECLARE_MUTEX(callback_sem); |
184 | static int cpuset_sem_depth; | ||
185 | |||
186 | /* | ||
187 | * The global cpuset semaphore cpuset_sem can be needed by the | ||
188 | * memory allocator to update a tasks mems_allowed (see the calls | ||
189 | * to cpuset_update_current_mems_allowed()) or to walk up the | ||
190 | * cpuset hierarchy to find a mem_exclusive cpuset see the calls | ||
191 | * to cpuset_excl_nodes_overlap()). | ||
192 | * | ||
193 | * But if the memory allocation is being done by cpuset.c code, it | ||
194 | * usually already holds cpuset_sem. Double tripping on a kernel | ||
195 | * semaphore deadlocks the current task, and any other task that | ||
196 | * subsequently tries to obtain the lock. | ||
197 | * | ||
198 | * Run all up's and down's on cpuset_sem through the following | ||
199 | * wrappers, which will detect this nested locking, and avoid | ||
200 | * deadlocking. | ||
201 | */ | ||
202 | |||
203 | static inline void cpuset_down(struct semaphore *psem) | ||
204 | { | ||
205 | if (cpuset_sem_owner != current) { | ||
206 | down(psem); | ||
207 | cpuset_sem_owner = current; | ||
208 | } | ||
209 | cpuset_sem_depth++; | ||
210 | } | ||
211 | |||
212 | static inline void cpuset_up(struct semaphore *psem) | ||
213 | { | ||
214 | if (--cpuset_sem_depth == 0) { | ||
215 | cpuset_sem_owner = NULL; | ||
216 | up(psem); | ||
217 | } | ||
218 | } | ||
219 | 234 | ||
220 | /* | 235 | /* |
221 | * A couple of forward declarations required, due to cyclic reference loop: | 236 | * A couple of forward declarations required, due to cyclic reference loop: |
@@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
390 | } | 405 | } |
391 | 406 | ||
392 | /* | 407 | /* |
393 | * Call with cpuset_sem held. Writes path of cpuset into buf. | 408 | * Call with manage_sem held. Writes path of cpuset into buf. |
394 | * Returns 0 on success, -errno on error. | 409 | * Returns 0 on success, -errno on error. |
395 | */ | 410 | */ |
396 | 411 | ||
@@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
442 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 457 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
443 | * our caller up for that. | 458 | * our caller up for that. |
444 | * | 459 | * |
445 | * The simple act of forking that task might require more memory, | 460 | * When we had only one cpuset semaphore, we had to call this |
446 | * which might need cpuset_sem. So this routine must be called while | 461 | * without holding it, to avoid deadlock when call_usermodehelper() |
447 | * cpuset_sem is not held, to avoid a possible deadlock. See also | 462 | * allocated memory. With two locks, we could now call this while |
448 | * comments for check_for_release(), below. | 463 | * holding manage_sem, but we still don't, so as to minimize |
464 | * the time manage_sem is held. | ||
449 | */ | 465 | */ |
450 | 466 | ||
451 | static void cpuset_release_agent(const char *pathbuf) | 467 | static void cpuset_release_agent(const char *pathbuf) |
@@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
477 | * cs is notify_on_release() and now both the user count is zero and | 493 | * cs is notify_on_release() and now both the user count is zero and |
478 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 494 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
479 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 495 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
480 | * cpuset_release_agent() with it later on, once cpuset_sem is dropped. | 496 | * cpuset_release_agent() with it later on, once manage_sem is dropped. |
481 | * Call here with cpuset_sem held. | 497 | * Call here with manage_sem held. |
482 | * | 498 | * |
483 | * This check_for_release() routine is responsible for kmalloc'ing | 499 | * This check_for_release() routine is responsible for kmalloc'ing |
484 | * pathbuf. The above cpuset_release_agent() is responsible for | 500 | * pathbuf. The above cpuset_release_agent() is responsible for |
485 | * kfree'ing pathbuf. The caller of these routines is responsible | 501 | * kfree'ing pathbuf. The caller of these routines is responsible |
486 | * for providing a pathbuf pointer, initialized to NULL, then | 502 | * for providing a pathbuf pointer, initialized to NULL, then |
487 | * calling check_for_release() with cpuset_sem held and the address | 503 | * calling check_for_release() with manage_sem held and the address |
488 | * of the pathbuf pointer, then dropping cpuset_sem, then calling | 504 | * of the pathbuf pointer, then dropping manage_sem, then calling |
489 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 505 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
490 | */ | 506 | */ |
491 | 507 | ||
@@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
516 | * One way or another, we guarantee to return some non-empty subset | 532 | * One way or another, we guarantee to return some non-empty subset |
517 | * of cpu_online_map. | 533 | * of cpu_online_map. |
518 | * | 534 | * |
519 | * Call with cpuset_sem held. | 535 | * Call with callback_sem held. |
520 | */ | 536 | */ |
521 | 537 | ||
522 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 538 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
@@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
540 | * One way or another, we guarantee to return some non-empty subset | 556 | * One way or another, we guarantee to return some non-empty subset |
541 | * of node_online_map. | 557 | * of node_online_map. |
542 | * | 558 | * |
543 | * Call with cpuset_sem held. | 559 | * Call with callback_sem held. |
544 | */ | 560 | */ |
545 | 561 | ||
546 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 562 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
@@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
555 | } | 571 | } |
556 | 572 | ||
557 | /* | 573 | /* |
558 | * Refresh current tasks mems_allowed and mems_generation from | 574 | * Refresh current tasks mems_allowed and mems_generation from current |
559 | * current tasks cpuset. Call with cpuset_sem held. | 575 | * tasks cpuset. |
560 | * | 576 | * |
561 | * This routine is needed to update the per-task mems_allowed | 577 | * Call without callback_sem or task_lock() held. May be called with |
562 | * data, within the tasks context, when it is trying to allocate | 578 | * or without manage_sem held. Will acquire task_lock() and might |
563 | * memory (in various mm/mempolicy.c routines) and notices | 579 | * acquire callback_sem during call. |
564 | * that some other task has been modifying its cpuset. | 580 | * |
581 | * The task_lock() is required to dereference current->cpuset safely. | ||
582 | * Without it, we could pick up the pointer value of current->cpuset | ||
583 | * in one instruction, and then attach_task could give us a different | ||
584 | * cpuset, and then the cpuset we had could be removed and freed, | ||
585 | * and then on our next instruction, we could dereference a no longer | ||
586 | * valid cpuset pointer to get its mems_generation field. | ||
587 | * | ||
588 | * This routine is needed to update the per-task mems_allowed data, | ||
589 | * within the tasks context, when it is trying to allocate memory | ||
590 | * (in various mm/mempolicy.c routines) and notices that some other | ||
591 | * task has been modifying its cpuset. | ||
565 | */ | 592 | */ |
566 | 593 | ||
567 | static void refresh_mems(void) | 594 | static void refresh_mems(void) |
568 | { | 595 | { |
569 | struct cpuset *cs = current->cpuset; | 596 | int my_cpusets_mem_gen; |
597 | |||
598 | task_lock(current); | ||
599 | my_cpusets_mem_gen = current->cpuset->mems_generation; | ||
600 | task_unlock(current); | ||
570 | 601 | ||
571 | if (current->cpuset_mems_generation != cs->mems_generation) { | 602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { |
603 | struct cpuset *cs; | ||
604 | nodemask_t oldmem = current->mems_allowed; | ||
605 | |||
606 | down(&callback_sem); | ||
607 | task_lock(current); | ||
608 | cs = current->cpuset; | ||
572 | guarantee_online_mems(cs, &current->mems_allowed); | 609 | guarantee_online_mems(cs, &current->mems_allowed); |
573 | current->cpuset_mems_generation = cs->mems_generation; | 610 | current->cpuset_mems_generation = cs->mems_generation; |
611 | task_unlock(current); | ||
612 | up(&callback_sem); | ||
613 | if (!nodes_equal(oldmem, current->mems_allowed)) | ||
614 | numa_policy_rebind(&oldmem, &current->mems_allowed); | ||
574 | } | 615 | } |
575 | } | 616 | } |
576 | 617 | ||
@@ -579,7 +620,7 @@ static void refresh_mems(void) | |||
579 | * | 620 | * |
580 | * One cpuset is a subset of another if all its allowed CPUs and | 621 | * One cpuset is a subset of another if all its allowed CPUs and |
581 | * Memory Nodes are a subset of the other, and its exclusive flags | 622 | * Memory Nodes are a subset of the other, and its exclusive flags |
582 | * are only set if the other's are set. | 623 | * are only set if the other's are set. Call holding manage_sem. |
583 | */ | 624 | */ |
584 | 625 | ||
585 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 626 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
597 | * If we replaced the flag and mask values of the current cpuset | 638 | * If we replaced the flag and mask values of the current cpuset |
598 | * (cur) with those values in the trial cpuset (trial), would | 639 | * (cur) with those values in the trial cpuset (trial), would |
599 | * our various subset and exclusive rules still be valid? Presumes | 640 | * our various subset and exclusive rules still be valid? Presumes |
600 | * cpuset_sem held. | 641 | * manage_sem held. |
601 | * | 642 | * |
602 | * 'cur' is the address of an actual, in-use cpuset. Operations | 643 | * 'cur' is the address of an actual, in-use cpuset. Operations |
603 | * such as list traversal that depend on the actual address of the | 644 | * such as list traversal that depend on the actual address of the |
@@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
651 | * exclusive child cpusets | 692 | * exclusive child cpusets |
652 | * Build these two partitions by calling partition_sched_domains | 693 | * Build these two partitions by calling partition_sched_domains |
653 | * | 694 | * |
654 | * Call with cpuset_sem held. May nest a call to the | 695 | * Call with manage_sem held. May nest a call to the |
655 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 696 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
656 | */ | 697 | */ |
657 | 698 | ||
@@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) | |||
696 | unlock_cpu_hotplug(); | 737 | unlock_cpu_hotplug(); |
697 | } | 738 | } |
698 | 739 | ||
740 | /* | ||
741 | * Call with manage_sem held. May take callback_sem during call. | ||
742 | */ | ||
743 | |||
699 | static int update_cpumask(struct cpuset *cs, char *buf) | 744 | static int update_cpumask(struct cpuset *cs, char *buf) |
700 | { | 745 | { |
701 | struct cpuset trialcs; | 746 | struct cpuset trialcs; |
@@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
712 | if (retval < 0) | 757 | if (retval < 0) |
713 | return retval; | 758 | return retval; |
714 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 759 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
760 | down(&callback_sem); | ||
715 | cs->cpus_allowed = trialcs.cpus_allowed; | 761 | cs->cpus_allowed = trialcs.cpus_allowed; |
762 | up(&callback_sem); | ||
716 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 763 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
717 | update_cpu_domains(cs); | 764 | update_cpu_domains(cs); |
718 | return 0; | 765 | return 0; |
719 | } | 766 | } |
720 | 767 | ||
768 | /* | ||
769 | * Call with manage_sem held. May take callback_sem during call. | ||
770 | */ | ||
771 | |||
721 | static int update_nodemask(struct cpuset *cs, char *buf) | 772 | static int update_nodemask(struct cpuset *cs, char *buf) |
722 | { | 773 | { |
723 | struct cpuset trialcs; | 774 | struct cpuset trialcs; |
@@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
732 | return -ENOSPC; | 783 | return -ENOSPC; |
733 | retval = validate_change(cs, &trialcs); | 784 | retval = validate_change(cs, &trialcs); |
734 | if (retval == 0) { | 785 | if (retval == 0) { |
786 | down(&callback_sem); | ||
735 | cs->mems_allowed = trialcs.mems_allowed; | 787 | cs->mems_allowed = trialcs.mems_allowed; |
736 | atomic_inc(&cpuset_mems_generation); | 788 | atomic_inc(&cpuset_mems_generation); |
737 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 789 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
790 | up(&callback_sem); | ||
738 | } | 791 | } |
739 | return retval; | 792 | return retval; |
740 | } | 793 | } |
@@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
745 | * CS_NOTIFY_ON_RELEASE) | 798 | * CS_NOTIFY_ON_RELEASE) |
746 | * cs: the cpuset to update | 799 | * cs: the cpuset to update |
747 | * buf: the buffer where we read the 0 or 1 | 800 | * buf: the buffer where we read the 0 or 1 |
801 | * | ||
802 | * Call with manage_sem held. | ||
748 | */ | 803 | */ |
749 | 804 | ||
750 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 805 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
@@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
766 | return err; | 821 | return err; |
767 | cpu_exclusive_changed = | 822 | cpu_exclusive_changed = |
768 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 823 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
824 | down(&callback_sem); | ||
769 | if (turning_on) | 825 | if (turning_on) |
770 | set_bit(bit, &cs->flags); | 826 | set_bit(bit, &cs->flags); |
771 | else | 827 | else |
772 | clear_bit(bit, &cs->flags); | 828 | clear_bit(bit, &cs->flags); |
829 | up(&callback_sem); | ||
773 | 830 | ||
774 | if (cpu_exclusive_changed) | 831 | if (cpu_exclusive_changed) |
775 | update_cpu_domains(cs); | 832 | update_cpu_domains(cs); |
776 | return 0; | 833 | return 0; |
777 | } | 834 | } |
778 | 835 | ||
836 | /* | ||
837 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | ||
838 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | ||
839 | * notified on release. | ||
840 | * | ||
841 | * Call holding manage_sem. May take callback_sem and task_lock of | ||
842 | * the task 'pid' during call. | ||
843 | */ | ||
844 | |||
779 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | 845 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) |
780 | { | 846 | { |
781 | pid_t pid; | 847 | pid_t pid; |
@@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
792 | read_lock(&tasklist_lock); | 858 | read_lock(&tasklist_lock); |
793 | 859 | ||
794 | tsk = find_task_by_pid(pid); | 860 | tsk = find_task_by_pid(pid); |
795 | if (!tsk) { | 861 | if (!tsk || tsk->flags & PF_EXITING) { |
796 | read_unlock(&tasklist_lock); | 862 | read_unlock(&tasklist_lock); |
797 | return -ESRCH; | 863 | return -ESRCH; |
798 | } | 864 | } |
@@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
810 | get_task_struct(tsk); | 876 | get_task_struct(tsk); |
811 | } | 877 | } |
812 | 878 | ||
879 | down(&callback_sem); | ||
880 | |||
813 | task_lock(tsk); | 881 | task_lock(tsk); |
814 | oldcs = tsk->cpuset; | 882 | oldcs = tsk->cpuset; |
815 | if (!oldcs) { | 883 | if (!oldcs) { |
816 | task_unlock(tsk); | 884 | task_unlock(tsk); |
885 | up(&callback_sem); | ||
817 | put_task_struct(tsk); | 886 | put_task_struct(tsk); |
818 | return -ESRCH; | 887 | return -ESRCH; |
819 | } | 888 | } |
@@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
824 | guarantee_online_cpus(cs, &cpus); | 893 | guarantee_online_cpus(cs, &cpus); |
825 | set_cpus_allowed(tsk, cpus); | 894 | set_cpus_allowed(tsk, cpus); |
826 | 895 | ||
896 | up(&callback_sem); | ||
827 | put_task_struct(tsk); | 897 | put_task_struct(tsk); |
828 | if (atomic_dec_and_test(&oldcs->count)) | 898 | if (atomic_dec_and_test(&oldcs->count)) |
829 | check_for_release(oldcs, ppathbuf); | 899 | check_for_release(oldcs, ppathbuf); |
@@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
867 | } | 937 | } |
868 | buffer[nbytes] = 0; /* nul-terminate */ | 938 | buffer[nbytes] = 0; /* nul-terminate */ |
869 | 939 | ||
870 | cpuset_down(&cpuset_sem); | 940 | down(&manage_sem); |
871 | 941 | ||
872 | if (is_removed(cs)) { | 942 | if (is_removed(cs)) { |
873 | retval = -ENODEV; | 943 | retval = -ENODEV; |
@@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
901 | if (retval == 0) | 971 | if (retval == 0) |
902 | retval = nbytes; | 972 | retval = nbytes; |
903 | out2: | 973 | out2: |
904 | cpuset_up(&cpuset_sem); | 974 | up(&manage_sem); |
905 | cpuset_release_agent(pathbuf); | 975 | cpuset_release_agent(pathbuf); |
906 | out1: | 976 | out1: |
907 | kfree(buffer); | 977 | kfree(buffer); |
@@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
941 | { | 1011 | { |
942 | cpumask_t mask; | 1012 | cpumask_t mask; |
943 | 1013 | ||
944 | cpuset_down(&cpuset_sem); | 1014 | down(&callback_sem); |
945 | mask = cs->cpus_allowed; | 1015 | mask = cs->cpus_allowed; |
946 | cpuset_up(&cpuset_sem); | 1016 | up(&callback_sem); |
947 | 1017 | ||
948 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1018 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
949 | } | 1019 | } |
@@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
952 | { | 1022 | { |
953 | nodemask_t mask; | 1023 | nodemask_t mask; |
954 | 1024 | ||
955 | cpuset_down(&cpuset_sem); | 1025 | down(&callback_sem); |
956 | mask = cs->mems_allowed; | 1026 | mask = cs->mems_allowed; |
957 | cpuset_up(&cpuset_sem); | 1027 | up(&callback_sem); |
958 | 1028 | ||
959 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1029 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
960 | } | 1030 | } |
@@ -995,7 +1065,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
995 | goto out; | 1065 | goto out; |
996 | } | 1066 | } |
997 | *s++ = '\n'; | 1067 | *s++ = '\n'; |
998 | *s = '\0'; | ||
999 | 1068 | ||
1000 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); | 1069 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); |
1001 | out: | 1070 | out: |
@@ -1048,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) | |||
1048 | return 0; | 1117 | return 0; |
1049 | } | 1118 | } |
1050 | 1119 | ||
1120 | /* | ||
1121 | * cpuset_rename - Only allow simple rename of directories in place. | ||
1122 | */ | ||
1123 | static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
1124 | struct inode *new_dir, struct dentry *new_dentry) | ||
1125 | { | ||
1126 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | ||
1127 | return -ENOTDIR; | ||
1128 | if (new_dentry->d_inode) | ||
1129 | return -EEXIST; | ||
1130 | if (old_dir != new_dir) | ||
1131 | return -EIO; | ||
1132 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
1133 | } | ||
1134 | |||
1051 | static struct file_operations cpuset_file_operations = { | 1135 | static struct file_operations cpuset_file_operations = { |
1052 | .read = cpuset_file_read, | 1136 | .read = cpuset_file_read, |
1053 | .write = cpuset_file_write, | 1137 | .write = cpuset_file_write, |
@@ -1060,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { | |||
1060 | .lookup = simple_lookup, | 1144 | .lookup = simple_lookup, |
1061 | .mkdir = cpuset_mkdir, | 1145 | .mkdir = cpuset_mkdir, |
1062 | .rmdir = cpuset_rmdir, | 1146 | .rmdir = cpuset_rmdir, |
1147 | .rename = cpuset_rename, | ||
1063 | }; | 1148 | }; |
1064 | 1149 | ||
1065 | static int cpuset_create_file(struct dentry *dentry, int mode) | 1150 | static int cpuset_create_file(struct dentry *dentry, int mode) |
@@ -1163,7 +1248,9 @@ struct ctr_struct { | |||
1163 | 1248 | ||
1164 | /* | 1249 | /* |
1165 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. | 1250 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. |
1166 | * Return actual number of pids loaded. | 1251 | * Return actual number of pids loaded. No need to task_lock(p) |
1252 | * when reading out p->cpuset, as we don't really care if it changes | ||
1253 | * on the next cycle, and we are not going to try to dereference it. | ||
1167 | */ | 1254 | */ |
1168 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | 1255 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) |
1169 | { | 1256 | { |
@@ -1205,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
1205 | return cnt; | 1292 | return cnt; |
1206 | } | 1293 | } |
1207 | 1294 | ||
1295 | /* | ||
1296 | * Handle an open on 'tasks' file. Prepare a buffer listing the | ||
1297 | * process id's of tasks currently attached to the cpuset being opened. | ||
1298 | * | ||
1299 | * Does not require any specific cpuset semaphores, and does not take any. | ||
1300 | */ | ||
1208 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1301 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
1209 | { | 1302 | { |
1210 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1303 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); |
@@ -1352,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1352 | if (!cs) | 1445 | if (!cs) |
1353 | return -ENOMEM; | 1446 | return -ENOMEM; |
1354 | 1447 | ||
1355 | cpuset_down(&cpuset_sem); | 1448 | down(&manage_sem); |
1449 | refresh_mems(); | ||
1356 | cs->flags = 0; | 1450 | cs->flags = 0; |
1357 | if (notify_on_release(parent)) | 1451 | if (notify_on_release(parent)) |
1358 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1452 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
@@ -1366,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1366 | 1460 | ||
1367 | cs->parent = parent; | 1461 | cs->parent = parent; |
1368 | 1462 | ||
1463 | down(&callback_sem); | ||
1369 | list_add(&cs->sibling, &cs->parent->children); | 1464 | list_add(&cs->sibling, &cs->parent->children); |
1465 | up(&callback_sem); | ||
1370 | 1466 | ||
1371 | err = cpuset_create_dir(cs, name, mode); | 1467 | err = cpuset_create_dir(cs, name, mode); |
1372 | if (err < 0) | 1468 | if (err < 0) |
1373 | goto err; | 1469 | goto err; |
1374 | 1470 | ||
1375 | /* | 1471 | /* |
1376 | * Release cpuset_sem before cpuset_populate_dir() because it | 1472 | * Release manage_sem before cpuset_populate_dir() because it |
1377 | * will down() this new directory's i_sem and if we race with | 1473 | * will down() this new directory's i_sem and if we race with |
1378 | * another mkdir, we might deadlock. | 1474 | * another mkdir, we might deadlock. |
1379 | */ | 1475 | */ |
1380 | cpuset_up(&cpuset_sem); | 1476 | up(&manage_sem); |
1381 | 1477 | ||
1382 | err = cpuset_populate_dir(cs->dentry); | 1478 | err = cpuset_populate_dir(cs->dentry); |
1383 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1479 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
1384 | return 0; | 1480 | return 0; |
1385 | err: | 1481 | err: |
1386 | list_del(&cs->sibling); | 1482 | list_del(&cs->sibling); |
1387 | cpuset_up(&cpuset_sem); | 1483 | up(&manage_sem); |
1388 | kfree(cs); | 1484 | kfree(cs); |
1389 | return err; | 1485 | return err; |
1390 | } | 1486 | } |
@@ -1406,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1406 | 1502 | ||
1407 | /* the vfs holds both inode->i_sem already */ | 1503 | /* the vfs holds both inode->i_sem already */ |
1408 | 1504 | ||
1409 | cpuset_down(&cpuset_sem); | 1505 | down(&manage_sem); |
1506 | refresh_mems(); | ||
1410 | if (atomic_read(&cs->count) > 0) { | 1507 | if (atomic_read(&cs->count) > 0) { |
1411 | cpuset_up(&cpuset_sem); | 1508 | up(&manage_sem); |
1412 | return -EBUSY; | 1509 | return -EBUSY; |
1413 | } | 1510 | } |
1414 | if (!list_empty(&cs->children)) { | 1511 | if (!list_empty(&cs->children)) { |
1415 | cpuset_up(&cpuset_sem); | 1512 | up(&manage_sem); |
1416 | return -EBUSY; | 1513 | return -EBUSY; |
1417 | } | 1514 | } |
1418 | parent = cs->parent; | 1515 | parent = cs->parent; |
1516 | down(&callback_sem); | ||
1419 | set_bit(CS_REMOVED, &cs->flags); | 1517 | set_bit(CS_REMOVED, &cs->flags); |
1420 | if (is_cpu_exclusive(cs)) | 1518 | if (is_cpu_exclusive(cs)) |
1421 | update_cpu_domains(cs); | 1519 | update_cpu_domains(cs); |
1422 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1520 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
1423 | if (list_empty(&parent->children)) | ||
1424 | check_for_release(parent, &pathbuf); | ||
1425 | spin_lock(&cs->dentry->d_lock); | 1521 | spin_lock(&cs->dentry->d_lock); |
1426 | d = dget(cs->dentry); | 1522 | d = dget(cs->dentry); |
1427 | cs->dentry = NULL; | 1523 | cs->dentry = NULL; |
1428 | spin_unlock(&d->d_lock); | 1524 | spin_unlock(&d->d_lock); |
1429 | cpuset_d_remove_dir(d); | 1525 | cpuset_d_remove_dir(d); |
1430 | dput(d); | 1526 | dput(d); |
1431 | cpuset_up(&cpuset_sem); | 1527 | up(&callback_sem); |
1528 | if (list_empty(&parent->children)) | ||
1529 | check_for_release(parent, &pathbuf); | ||
1530 | up(&manage_sem); | ||
1432 | cpuset_release_agent(pathbuf); | 1531 | cpuset_release_agent(pathbuf); |
1433 | return 0; | 1532 | return 0; |
1434 | } | 1533 | } |
@@ -1488,16 +1587,26 @@ void __init cpuset_init_smp(void) | |||
1488 | * cpuset_fork - attach newly forked task to its parents cpuset. | 1587 | * cpuset_fork - attach newly forked task to its parents cpuset. |
1489 | * @tsk: pointer to task_struct of forking parent process. | 1588 | * @tsk: pointer to task_struct of forking parent process. |
1490 | * | 1589 | * |
1491 | * Description: By default, on fork, a task inherits its | 1590 | * Description: A task inherits its parent's cpuset at fork(). |
1492 | * parent's cpuset. The pointer to the shared cpuset is | 1591 | * |
1493 | * automatically copied in fork.c by dup_task_struct(). | 1592 | * A pointer to the shared cpuset was automatically copied in fork.c |
1494 | * This cpuset_fork() routine need only increment the usage | 1593 | * by dup_task_struct(). However, we ignore that copy, since it was |
1495 | * counter in that cpuset. | 1594 | * not made under the protection of task_lock(), so might no longer be |
1595 | * a valid cpuset pointer. attach_task() might have already changed | ||
1596 | * current->cpuset, allowing the previously referenced cpuset to | ||
1597 | * be removed and freed. Instead, we task_lock(current) and copy | ||
1598 | * its present value of current->cpuset for our freshly forked child. | ||
1599 | * | ||
1600 | * At the point that cpuset_fork() is called, 'current' is the parent | ||
1601 | * task, and the passed argument 'child' points to the child task. | ||
1496 | **/ | 1602 | **/ |
1497 | 1603 | ||
1498 | void cpuset_fork(struct task_struct *tsk) | 1604 | void cpuset_fork(struct task_struct *child) |
1499 | { | 1605 | { |
1500 | atomic_inc(&tsk->cpuset->count); | 1606 | task_lock(current); |
1607 | child->cpuset = current->cpuset; | ||
1608 | atomic_inc(&child->cpuset->count); | ||
1609 | task_unlock(current); | ||
1501 | } | 1610 | } |
1502 | 1611 | ||
1503 | /** | 1612 | /** |
@@ -1506,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk) | |||
1506 | * | 1615 | * |
1507 | * Description: Detach cpuset from @tsk and release it. | 1616 | * Description: Detach cpuset from @tsk and release it. |
1508 | * | 1617 | * |
1509 | * Note that cpusets marked notify_on_release force every task | 1618 | * Note that cpusets marked notify_on_release force every task in |
1510 | * in them to take the global cpuset_sem semaphore when exiting. | 1619 | * them to take the global manage_sem semaphore when exiting. |
1511 | * This could impact scaling on very large systems. Be reluctant | 1620 | * This could impact scaling on very large systems. Be reluctant to |
1512 | * to use notify_on_release cpusets where very high task exit | 1621 | * use notify_on_release cpusets where very high task exit scaling |
1513 | * scaling is required on large systems. | 1622 | * is required on large systems. |
1514 | * | 1623 | * |
1515 | * Don't even think about derefencing 'cs' after the cpuset use | 1624 | * Don't even think about dereferencing 'cs' after the cpuset use count |
1516 | * count goes to zero, except inside a critical section guarded | 1625 | * goes to zero, except inside a critical section guarded by manage_sem |
1517 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, | 1626 | * or callback_sem. Otherwise a zero cpuset use count is a license to |
1518 | * then a zero cpuset use count is a license to any other task to | 1627 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
1519 | * nuke the cpuset immediately. | 1628 | * |
1629 | * This routine has to take manage_sem, not callback_sem, because | ||
1630 | * it is holding that semaphore while calling check_for_release(), | ||
1631 | * which calls kmalloc(), so can't be called holding callback_sem. | ||
1632 | * | ||
1633 | * We don't need to task_lock() this reference to tsk->cpuset, | ||
1634 | * because tsk is already marked PF_EXITING, so attach_task() won't | ||
1635 | * mess with it. | ||
1520 | **/ | 1636 | **/ |
1521 | 1637 | ||
1522 | void cpuset_exit(struct task_struct *tsk) | 1638 | void cpuset_exit(struct task_struct *tsk) |
1523 | { | 1639 | { |
1524 | struct cpuset *cs; | 1640 | struct cpuset *cs; |
1525 | 1641 | ||
1526 | task_lock(tsk); | 1642 | BUG_ON(!(tsk->flags & PF_EXITING)); |
1643 | |||
1527 | cs = tsk->cpuset; | 1644 | cs = tsk->cpuset; |
1528 | tsk->cpuset = NULL; | 1645 | tsk->cpuset = NULL; |
1529 | task_unlock(tsk); | ||
1530 | 1646 | ||
1531 | if (notify_on_release(cs)) { | 1647 | if (notify_on_release(cs)) { |
1532 | char *pathbuf = NULL; | 1648 | char *pathbuf = NULL; |
1533 | 1649 | ||
1534 | cpuset_down(&cpuset_sem); | 1650 | down(&manage_sem); |
1535 | if (atomic_dec_and_test(&cs->count)) | 1651 | if (atomic_dec_and_test(&cs->count)) |
1536 | check_for_release(cs, &pathbuf); | 1652 | check_for_release(cs, &pathbuf); |
1537 | cpuset_up(&cpuset_sem); | 1653 | up(&manage_sem); |
1538 | cpuset_release_agent(pathbuf); | 1654 | cpuset_release_agent(pathbuf); |
1539 | } else { | 1655 | } else { |
1540 | atomic_dec(&cs->count); | 1656 | atomic_dec(&cs->count); |
@@ -1555,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
1555 | { | 1671 | { |
1556 | cpumask_t mask; | 1672 | cpumask_t mask; |
1557 | 1673 | ||
1558 | cpuset_down(&cpuset_sem); | 1674 | down(&callback_sem); |
1559 | task_lock((struct task_struct *)tsk); | 1675 | task_lock((struct task_struct *)tsk); |
1560 | guarantee_online_cpus(tsk->cpuset, &mask); | 1676 | guarantee_online_cpus(tsk->cpuset, &mask); |
1561 | task_unlock((struct task_struct *)tsk); | 1677 | task_unlock((struct task_struct *)tsk); |
1562 | cpuset_up(&cpuset_sem); | 1678 | up(&callback_sem); |
1563 | 1679 | ||
1564 | return mask; | 1680 | return mask; |
1565 | } | 1681 | } |
@@ -1575,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void) | |||
1575 | * If the current tasks cpusets mems_allowed changed behind our backs, | 1691 | * If the current tasks cpusets mems_allowed changed behind our backs, |
1576 | * update current->mems_allowed and mems_generation to the new value. | 1692 | * update current->mems_allowed and mems_generation to the new value. |
1577 | * Do not call this routine if in_interrupt(). | 1693 | * Do not call this routine if in_interrupt(). |
1694 | * | ||
1695 | * Call without callback_sem or task_lock() held. May be called | ||
1696 | * with or without manage_sem held. Unless exiting, it will acquire | ||
1697 | * task_lock(). Also might acquire callback_sem during call to | ||
1698 | * refresh_mems(). | ||
1578 | */ | 1699 | */ |
1579 | 1700 | ||
1580 | void cpuset_update_current_mems_allowed(void) | 1701 | void cpuset_update_current_mems_allowed(void) |
1581 | { | 1702 | { |
1582 | struct cpuset *cs = current->cpuset; | 1703 | struct cpuset *cs; |
1704 | int need_to_refresh = 0; | ||
1583 | 1705 | ||
1706 | task_lock(current); | ||
1707 | cs = current->cpuset; | ||
1584 | if (!cs) | 1708 | if (!cs) |
1585 | return; /* task is exiting */ | 1709 | goto done; |
1586 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1710 | if (current->cpuset_mems_generation != cs->mems_generation) |
1587 | cpuset_down(&cpuset_sem); | 1711 | need_to_refresh = 1; |
1712 | done: | ||
1713 | task_unlock(current); | ||
1714 | if (need_to_refresh) | ||
1588 | refresh_mems(); | 1715 | refresh_mems(); |
1589 | cpuset_up(&cpuset_sem); | ||
1590 | } | ||
1591 | } | 1716 | } |
1592 | 1717 | ||
1593 | /** | 1718 | /** |
@@ -1621,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
1621 | 1746 | ||
1622 | /* | 1747 | /* |
1623 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 1748 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
1624 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | 1749 | * ancestor to the specified cpuset. Call holding callback_sem. |
1625 | * If no ancestor is mem_exclusive (an unusual configuration), then | 1750 | * If no ancestor is mem_exclusive (an unusual configuration), then |
1626 | * returns the root cpuset. | 1751 | * returns the root cpuset. |
1627 | */ | 1752 | */ |
@@ -1648,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1648 | * GFP_KERNEL allocations are not so marked, so can escape to the | 1773 | * GFP_KERNEL allocations are not so marked, so can escape to the |
1649 | * nearest mem_exclusive ancestor cpuset. | 1774 | * nearest mem_exclusive ancestor cpuset. |
1650 | * | 1775 | * |
1651 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | 1776 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() |
1652 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 1777 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
1653 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 1778 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks |
1654 | * mems_allowed came up empty on the first pass over the zonelist. | 1779 | * mems_allowed came up empty on the first pass over the zonelist. |
1655 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 1780 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
1656 | * short of memory, might require taking the cpuset_sem semaphore. | 1781 | * short of memory, might require taking the callback_sem semaphore. |
1657 | * | 1782 | * |
1658 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 1783 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() |
1659 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 1784 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing |
@@ -1685,14 +1810,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
1685 | return 0; | 1810 | return 0; |
1686 | 1811 | ||
1687 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 1812 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
1688 | cpuset_down(&cpuset_sem); | 1813 | down(&callback_sem); |
1689 | cs = current->cpuset; | 1814 | |
1690 | if (!cs) | 1815 | if (current->flags & PF_EXITING) /* Let dying task have memory */ |
1691 | goto done; /* current task exiting */ | 1816 | return 1; |
1692 | cs = nearest_exclusive_ancestor(cs); | 1817 | task_lock(current); |
1818 | cs = nearest_exclusive_ancestor(current->cpuset); | ||
1819 | task_unlock(current); | ||
1820 | |||
1693 | allowed = node_isset(node, cs->mems_allowed); | 1821 | allowed = node_isset(node, cs->mems_allowed); |
1694 | done: | 1822 | up(&callback_sem); |
1695 | cpuset_up(&cpuset_sem); | ||
1696 | return allowed; | 1823 | return allowed; |
1697 | } | 1824 | } |
1698 | 1825 | ||
@@ -1705,7 +1832,7 @@ done: | |||
1705 | * determine if task @p's memory usage might impact the memory | 1832 | * determine if task @p's memory usage might impact the memory |
1706 | * available to the current task. | 1833 | * available to the current task. |
1707 | * | 1834 | * |
1708 | * Acquires cpuset_sem - not suitable for calling from a fast path. | 1835 | * Acquires callback_sem - not suitable for calling from a fast path. |
1709 | **/ | 1836 | **/ |
1710 | 1837 | ||
1711 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 1838 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
@@ -1713,18 +1840,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1713 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 1840 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
1714 | int overlap = 0; /* do cpusets overlap? */ | 1841 | int overlap = 0; /* do cpusets overlap? */ |
1715 | 1842 | ||
1716 | cpuset_down(&cpuset_sem); | 1843 | down(&callback_sem); |
1717 | cs1 = current->cpuset; | 1844 | |
1718 | if (!cs1) | 1845 | task_lock(current); |
1719 | goto done; /* current task exiting */ | 1846 | if (current->flags & PF_EXITING) { |
1720 | cs2 = p->cpuset; | 1847 | task_unlock(current); |
1721 | if (!cs2) | 1848 | goto done; |
1722 | goto done; /* task p is exiting */ | 1849 | } |
1723 | cs1 = nearest_exclusive_ancestor(cs1); | 1850 | cs1 = nearest_exclusive_ancestor(current->cpuset); |
1724 | cs2 = nearest_exclusive_ancestor(cs2); | 1851 | task_unlock(current); |
1852 | |||
1853 | task_lock((struct task_struct *)p); | ||
1854 | if (p->flags & PF_EXITING) { | ||
1855 | task_unlock((struct task_struct *)p); | ||
1856 | goto done; | ||
1857 | } | ||
1858 | cs2 = nearest_exclusive_ancestor(p->cpuset); | ||
1859 | task_unlock((struct task_struct *)p); | ||
1860 | |||
1725 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 1861 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
1726 | done: | 1862 | done: |
1727 | cpuset_up(&cpuset_sem); | 1863 | up(&callback_sem); |
1728 | 1864 | ||
1729 | return overlap; | 1865 | return overlap; |
1730 | } | 1866 | } |
@@ -1733,6 +1869,10 @@ done: | |||
1733 | * proc_cpuset_show() | 1869 | * proc_cpuset_show() |
1734 | * - Print tasks cpuset path into seq_file. | 1870 | * - Print tasks cpuset path into seq_file. |
1735 | * - Used for /proc/<pid>/cpuset. | 1871 | * - Used for /proc/<pid>/cpuset. |
1872 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | ||
1873 | * doesn't really matter if tsk->cpuset changes after we read it, | ||
1874 | * and we take manage_sem, keeping attach_task() from changing it | ||
1875 | * anyway. | ||
1736 | */ | 1876 | */ |
1737 | 1877 | ||
1738 | static int proc_cpuset_show(struct seq_file *m, void *v) | 1878 | static int proc_cpuset_show(struct seq_file *m, void *v) |
@@ -1747,10 +1887,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1747 | return -ENOMEM; | 1887 | return -ENOMEM; |
1748 | 1888 | ||
1749 | tsk = m->private; | 1889 | tsk = m->private; |
1750 | cpuset_down(&cpuset_sem); | 1890 | down(&manage_sem); |
1751 | task_lock(tsk); | ||
1752 | cs = tsk->cpuset; | 1891 | cs = tsk->cpuset; |
1753 | task_unlock(tsk); | ||
1754 | if (!cs) { | 1892 | if (!cs) { |
1755 | retval = -EINVAL; | 1893 | retval = -EINVAL; |
1756 | goto out; | 1894 | goto out; |
@@ -1762,7 +1900,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1762 | seq_puts(m, buf); | 1900 | seq_puts(m, buf); |
1763 | seq_putc(m, '\n'); | 1901 | seq_putc(m, '\n'); |
1764 | out: | 1902 | out: |
1765 | cpuset_up(&cpuset_sem); | 1903 | up(&manage_sem); |
1766 | kfree(buf); | 1904 | kfree(buf); |
1767 | return retval; | 1905 | return retval; |
1768 | } | 1906 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index 3b25b182d2be..537394b25e8d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -547,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
547 | 547 | ||
548 | if (p->pdeath_signal) | 548 | if (p->pdeath_signal) |
549 | /* We already hold the tasklist_lock here. */ | 549 | /* We already hold the tasklist_lock here. */ |
550 | group_send_sig_info(p->pdeath_signal, (void *) 0, p); | 550 | group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); |
551 | 551 | ||
552 | /* Move the child from its dying parent to the new one. */ | 552 | /* Move the child from its dying parent to the new one. */ |
553 | if (unlikely(traced)) { | 553 | if (unlikely(traced)) { |
@@ -591,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
591 | int pgrp = process_group(p); | 591 | int pgrp = process_group(p); |
592 | 592 | ||
593 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { | 593 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { |
594 | __kill_pg_info(SIGHUP, (void *)1, pgrp); | 594 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
595 | __kill_pg_info(SIGCONT, (void *)1, pgrp); | 595 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
596 | } | 596 | } |
597 | } | 597 | } |
598 | } | 598 | } |
@@ -727,8 +727,8 @@ static void exit_notify(struct task_struct *tsk) | |||
727 | (t->signal->session == tsk->signal->session) && | 727 | (t->signal->session == tsk->signal->session) && |
728 | will_become_orphaned_pgrp(process_group(tsk), tsk) && | 728 | will_become_orphaned_pgrp(process_group(tsk), tsk) && |
729 | has_stopped_jobs(process_group(tsk))) { | 729 | has_stopped_jobs(process_group(tsk))) { |
730 | __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); | 730 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); |
731 | __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); | 731 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); |
732 | } | 732 | } |
733 | 733 | ||
734 | /* Let father know we died | 734 | /* Let father know we died |
@@ -783,10 +783,6 @@ static void exit_notify(struct task_struct *tsk) | |||
783 | /* If the process is dead, release it - nobody will wait for it */ | 783 | /* If the process is dead, release it - nobody will wait for it */ |
784 | if (state == EXIT_DEAD) | 784 | if (state == EXIT_DEAD) |
785 | release_task(tsk); | 785 | release_task(tsk); |
786 | |||
787 | /* PF_DEAD causes final put_task_struct after we schedule. */ | ||
788 | preempt_disable(); | ||
789 | tsk->flags |= PF_DEAD; | ||
790 | } | 786 | } |
791 | 787 | ||
792 | fastcall NORET_TYPE void do_exit(long code) | 788 | fastcall NORET_TYPE void do_exit(long code) |
@@ -839,7 +835,10 @@ fastcall NORET_TYPE void do_exit(long code) | |||
839 | preempt_count()); | 835 | preempt_count()); |
840 | 836 | ||
841 | acct_update_integrals(tsk); | 837 | acct_update_integrals(tsk); |
842 | update_mem_hiwater(tsk); | 838 | if (tsk->mm) { |
839 | update_hiwater_rss(tsk->mm); | ||
840 | update_hiwater_vm(tsk->mm); | ||
841 | } | ||
843 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 842 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
844 | if (group_dead) { | 843 | if (group_dead) { |
845 | del_timer_sync(&tsk->signal->real_timer); | 844 | del_timer_sync(&tsk->signal->real_timer); |
@@ -870,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code) | |||
870 | tsk->mempolicy = NULL; | 869 | tsk->mempolicy = NULL; |
871 | #endif | 870 | #endif |
872 | 871 | ||
873 | BUG_ON(!(current->flags & PF_DEAD)); | 872 | /* PF_DEAD causes final put_task_struct after we schedule. */ |
873 | preempt_disable(); | ||
874 | BUG_ON(tsk->flags & PF_DEAD); | ||
875 | tsk->flags |= PF_DEAD; | ||
876 | |||
874 | schedule(); | 877 | schedule(); |
875 | BUG(); | 878 | BUG(); |
876 | /* Avoid "noreturn function does return". */ | 879 | /* Avoid "noreturn function does return". */ |
@@ -1380,6 +1383,15 @@ repeat: | |||
1380 | 1383 | ||
1381 | switch (p->state) { | 1384 | switch (p->state) { |
1382 | case TASK_TRACED: | 1385 | case TASK_TRACED: |
1386 | /* | ||
1387 | * When we hit the race with PTRACE_ATTACH, | ||
1388 | * we will not report this child. But the | ||
1389 | * race means it has not yet been moved to | ||
1390 | * our ptrace_children list, so we need to | ||
1391 | * set the flag here to avoid a spurious ECHILD | ||
1392 | * when the race happens with the only child. | ||
1393 | */ | ||
1394 | flag = 1; | ||
1383 | if (!my_ptrace_child(p)) | 1395 | if (!my_ptrace_child(p)) |
1384 | continue; | 1396 | continue; |
1385 | /*FALLTHROUGH*/ | 1397 | /*FALLTHROUGH*/ |
diff --git a/kernel/fork.c b/kernel/fork.c index 280bd44ac441..8a069612eac3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
182 | } | 182 | } |
183 | 183 | ||
184 | #ifdef CONFIG_MMU | 184 | #ifdef CONFIG_MMU |
185 | static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | 185 | static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
186 | { | 186 | { |
187 | struct vm_area_struct * mpnt, *tmp, **pprev; | 187 | struct vm_area_struct *mpnt, *tmp, **pprev; |
188 | struct rb_node **rb_link, *rb_parent; | 188 | struct rb_node **rb_link, *rb_parent; |
189 | int retval; | 189 | int retval; |
190 | unsigned long charge; | 190 | unsigned long charge; |
191 | struct mempolicy *pol; | 191 | struct mempolicy *pol; |
192 | 192 | ||
193 | down_write(&oldmm->mmap_sem); | 193 | down_write(&oldmm->mmap_sem); |
194 | flush_cache_mm(current->mm); | 194 | flush_cache_mm(oldmm); |
195 | down_write(&mm->mmap_sem); | ||
196 | |||
195 | mm->locked_vm = 0; | 197 | mm->locked_vm = 0; |
196 | mm->mmap = NULL; | 198 | mm->mmap = NULL; |
197 | mm->mmap_cache = NULL; | 199 | mm->mmap_cache = NULL; |
198 | mm->free_area_cache = oldmm->mmap_base; | 200 | mm->free_area_cache = oldmm->mmap_base; |
199 | mm->cached_hole_size = ~0UL; | 201 | mm->cached_hole_size = ~0UL; |
200 | mm->map_count = 0; | 202 | mm->map_count = 0; |
201 | set_mm_counter(mm, rss, 0); | ||
202 | set_mm_counter(mm, anon_rss, 0); | ||
203 | cpus_clear(mm->cpu_vm_mask); | 203 | cpus_clear(mm->cpu_vm_mask); |
204 | mm->mm_rb = RB_ROOT; | 204 | mm->mm_rb = RB_ROOT; |
205 | rb_link = &mm->mm_rb.rb_node; | 205 | rb_link = &mm->mm_rb.rb_node; |
206 | rb_parent = NULL; | 206 | rb_parent = NULL; |
207 | pprev = &mm->mmap; | 207 | pprev = &mm->mmap; |
208 | 208 | ||
209 | for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { | 209 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
210 | struct file *file; | 210 | struct file *file; |
211 | 211 | ||
212 | if (mpnt->vm_flags & VM_DONTCOPY) { | 212 | if (mpnt->vm_flags & VM_DONTCOPY) { |
213 | long pages = vma_pages(mpnt); | 213 | long pages = vma_pages(mpnt); |
214 | mm->total_vm -= pages; | 214 | mm->total_vm -= pages; |
215 | __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, | 215 | vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, |
216 | -pages); | 216 | -pages); |
217 | continue; | 217 | continue; |
218 | } | 218 | } |
@@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
253 | } | 253 | } |
254 | 254 | ||
255 | /* | 255 | /* |
256 | * Link in the new vma and copy the page table entries: | 256 | * Link in the new vma and copy the page table entries. |
257 | * link in first so that swapoff can see swap entries. | ||
258 | * Note that, exceptionally, here the vma is inserted | ||
259 | * without holding mm->mmap_sem. | ||
260 | */ | 257 | */ |
261 | spin_lock(&mm->page_table_lock); | ||
262 | *pprev = tmp; | 258 | *pprev = tmp; |
263 | pprev = &tmp->vm_next; | 259 | pprev = &tmp->vm_next; |
264 | 260 | ||
@@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
267 | rb_parent = &tmp->vm_rb; | 263 | rb_parent = &tmp->vm_rb; |
268 | 264 | ||
269 | mm->map_count++; | 265 | mm->map_count++; |
270 | retval = copy_page_range(mm, current->mm, tmp); | 266 | retval = copy_page_range(mm, oldmm, tmp); |
271 | spin_unlock(&mm->page_table_lock); | ||
272 | 267 | ||
273 | if (tmp->vm_ops && tmp->vm_ops->open) | 268 | if (tmp->vm_ops && tmp->vm_ops->open) |
274 | tmp->vm_ops->open(tmp); | 269 | tmp->vm_ops->open(tmp); |
@@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
277 | goto out; | 272 | goto out; |
278 | } | 273 | } |
279 | retval = 0; | 274 | retval = 0; |
280 | |||
281 | out: | 275 | out: |
282 | flush_tlb_mm(current->mm); | 276 | up_write(&mm->mmap_sem); |
277 | flush_tlb_mm(oldmm); | ||
283 | up_write(&oldmm->mmap_sem); | 278 | up_write(&oldmm->mmap_sem); |
284 | return retval; | 279 | return retval; |
285 | fail_nomem_policy: | 280 | fail_nomem_policy: |
@@ -323,6 +318,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm) | |||
323 | INIT_LIST_HEAD(&mm->mmlist); | 318 | INIT_LIST_HEAD(&mm->mmlist); |
324 | mm->core_waiters = 0; | 319 | mm->core_waiters = 0; |
325 | mm->nr_ptes = 0; | 320 | mm->nr_ptes = 0; |
321 | set_mm_counter(mm, file_rss, 0); | ||
322 | set_mm_counter(mm, anon_rss, 0); | ||
326 | spin_lock_init(&mm->page_table_lock); | 323 | spin_lock_init(&mm->page_table_lock); |
327 | rwlock_init(&mm->ioctx_list_lock); | 324 | rwlock_init(&mm->ioctx_list_lock); |
328 | mm->ioctx_list = NULL; | 325 | mm->ioctx_list = NULL; |
@@ -499,7 +496,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |||
499 | if (retval) | 496 | if (retval) |
500 | goto free_pt; | 497 | goto free_pt; |
501 | 498 | ||
502 | mm->hiwater_rss = get_mm_counter(mm,rss); | 499 | mm->hiwater_rss = get_mm_rss(mm); |
503 | mm->hiwater_vm = mm->total_vm; | 500 | mm->hiwater_vm = mm->total_vm; |
504 | 501 | ||
505 | good_mm: | 502 | good_mm: |
diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b2..3b4d5ad44cc6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
205 | /* | 205 | /* |
206 | * Do a quick atomic lookup first - this is the fastpath. | 206 | * Do a quick atomic lookup first - this is the fastpath. |
207 | */ | 207 | */ |
208 | spin_lock(&current->mm->page_table_lock); | 208 | page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET); |
209 | page = follow_page(mm, uaddr, 0); | ||
210 | if (likely(page != NULL)) { | 209 | if (likely(page != NULL)) { |
211 | key->shared.pgoff = | 210 | key->shared.pgoff = |
212 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 211 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
213 | spin_unlock(&current->mm->page_table_lock); | 212 | put_page(page); |
214 | return 0; | 213 | return 0; |
215 | } | 214 | } |
216 | spin_unlock(&current->mm->page_table_lock); | ||
217 | 215 | ||
218 | /* | 216 | /* |
219 | * Do it the general way. | 217 | * Do it the general way. |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b57..39277dd6bf90 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
19 | #include <linux/err.h> | 19 | #include <linux/err.h> |
20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
21 | #include <linux/sched.h> /* for cond_resched */ | ||
21 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
22 | 23 | ||
23 | #include <asm/sections.h> | 24 | #include <asm/sections.h> |
diff --git a/kernel/kexec.c b/kernel/kexec.c index cdd4dcd8fb63..2c95848fbce8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p) | |||
90 | static int kimage_is_destination_range(struct kimage *image, | 90 | static int kimage_is_destination_range(struct kimage *image, |
91 | unsigned long start, unsigned long end); | 91 | unsigned long start, unsigned long end); |
92 | static struct page *kimage_alloc_page(struct kimage *image, | 92 | static struct page *kimage_alloc_page(struct kimage *image, |
93 | unsigned int gfp_mask, | 93 | gfp_t gfp_mask, |
94 | unsigned long dest); | 94 | unsigned long dest); |
95 | 95 | ||
96 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | 96 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, |
@@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image, | |||
326 | return 0; | 326 | return 0; |
327 | } | 327 | } |
328 | 328 | ||
329 | static struct page *kimage_alloc_pages(unsigned int gfp_mask, | 329 | static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) |
330 | unsigned int order) | ||
331 | { | 330 | { |
332 | struct page *pages; | 331 | struct page *pages; |
333 | 332 | ||
@@ -335,7 +334,7 @@ static struct page *kimage_alloc_pages(unsigned int gfp_mask, | |||
335 | if (pages) { | 334 | if (pages) { |
336 | unsigned int count, i; | 335 | unsigned int count, i; |
337 | pages->mapping = NULL; | 336 | pages->mapping = NULL; |
338 | pages->private = order; | 337 | set_page_private(pages, order); |
339 | count = 1 << order; | 338 | count = 1 << order; |
340 | for (i = 0; i < count; i++) | 339 | for (i = 0; i < count; i++) |
341 | SetPageReserved(pages + i); | 340 | SetPageReserved(pages + i); |
@@ -348,7 +347,7 @@ static void kimage_free_pages(struct page *page) | |||
348 | { | 347 | { |
349 | unsigned int order, count, i; | 348 | unsigned int order, count, i; |
350 | 349 | ||
351 | order = page->private; | 350 | order = page_private(page); |
352 | count = 1 << order; | 351 | count = 1 << order; |
353 | for (i = 0; i < count; i++) | 352 | for (i = 0; i < count; i++) |
354 | ClearPageReserved(page + i); | 353 | ClearPageReserved(page + i); |
@@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image, | |||
654 | } | 653 | } |
655 | 654 | ||
656 | static struct page *kimage_alloc_page(struct kimage *image, | 655 | static struct page *kimage_alloc_page(struct kimage *image, |
657 | unsigned int gfp_mask, | 656 | gfp_t gfp_mask, |
658 | unsigned long destination) | 657 | unsigned long destination) |
659 | { | 658 | { |
660 | /* | 659 | /* |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8af..51a892063aaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -131,14 +131,14 @@ struct subprocess_info { | |||
131 | static int ____call_usermodehelper(void *data) | 131 | static int ____call_usermodehelper(void *data) |
132 | { | 132 | { |
133 | struct subprocess_info *sub_info = data; | 133 | struct subprocess_info *sub_info = data; |
134 | struct key *old_session; | 134 | struct key *new_session, *old_session; |
135 | int retval; | 135 | int retval; |
136 | 136 | ||
137 | /* Unblock all signals and set the session keyring. */ | 137 | /* Unblock all signals and set the session keyring. */ |
138 | key_get(sub_info->ring); | 138 | new_session = key_get(sub_info->ring); |
139 | flush_signals(current); | 139 | flush_signals(current); |
140 | spin_lock_irq(&current->sighand->siglock); | 140 | spin_lock_irq(&current->sighand->siglock); |
141 | old_session = __install_session_keyring(current, sub_info->ring); | 141 | old_session = __install_session_keyring(current, new_session); |
142 | flush_signal_handlers(current, 1); | 142 | flush_signal_handlers(current, 1); |
143 | sigemptyset(&current->blocked); | 143 | sigemptyset(&current->blocked); |
144 | recalc_sigpending(); | 144 | recalc_sigpending(); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f3ea492ab44d..ce4915dd683a 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
36 | #include <linux/hash.h> | 36 | #include <linux/hash.h> |
37 | #include <linux/init.h> | 37 | #include <linux/init.h> |
38 | #include <linux/slab.h> | ||
38 | #include <linux/module.h> | 39 | #include <linux/module.h> |
39 | #include <linux/moduleloader.h> | 40 | #include <linux/moduleloader.h> |
40 | #include <asm-generic/sections.h> | 41 | #include <asm-generic/sections.h> |
diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92da..e75950a1092c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind); | |||
165 | 165 | ||
166 | int kthread_stop(struct task_struct *k) | 166 | int kthread_stop(struct task_struct *k) |
167 | { | 167 | { |
168 | return kthread_stop_sem(k, NULL); | ||
169 | } | ||
170 | EXPORT_SYMBOL(kthread_stop); | ||
171 | |||
172 | int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | ||
173 | { | ||
168 | int ret; | 174 | int ret; |
169 | 175 | ||
170 | down(&kthread_stop_lock); | 176 | down(&kthread_stop_lock); |
@@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) | |||
178 | 184 | ||
179 | /* Now set kthread_should_stop() to true, and wake it up. */ | 185 | /* Now set kthread_should_stop() to true, and wake it up. */ |
180 | kthread_stop_info.k = k; | 186 | kthread_stop_info.k = k; |
181 | wake_up_process(k); | 187 | if (s) |
188 | up(s); | ||
189 | else | ||
190 | wake_up_process(k); | ||
182 | put_task_struct(k); | 191 | put_task_struct(k); |
183 | 192 | ||
184 | /* Once it dies, reset stop ptr, gather result and we're done. */ | 193 | /* Once it dies, reset stop ptr, gather result and we're done. */ |
@@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) | |||
189 | 198 | ||
190 | return ret; | 199 | return ret; |
191 | } | 200 | } |
192 | EXPORT_SYMBOL(kthread_stop); | 201 | EXPORT_SYMBOL(kthread_stop_sem); |
193 | 202 | ||
194 | static __init int helper_init(void) | 203 | static __init int helper_init(void) |
195 | { | 204 | { |
diff --git a/kernel/params.c b/kernel/params.c index 1a8614bac5d5..47ba69547945 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/device.h> | 24 | #include <linux/device.h> |
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/slab.h> | ||
26 | 27 | ||
27 | #if 0 | 28 | #if 0 |
28 | #define DEBUGP printk | 29 | #define DEBUGP printk |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index bf374fceb39c..91a894264941 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -1225,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1225 | /* | 1225 | /* |
1226 | * The task was cleaned up already, no future firings. | 1226 | * The task was cleaned up already, no future firings. |
1227 | */ | 1227 | */ |
1228 | return; | 1228 | goto out; |
1229 | 1229 | ||
1230 | /* | 1230 | /* |
1231 | * Fetch the current sample and update the timer's expiry time. | 1231 | * Fetch the current sample and update the timer's expiry time. |
@@ -1235,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1235 | bump_cpu_timer(timer, now); | 1235 | bump_cpu_timer(timer, now); |
1236 | if (unlikely(p->exit_state)) { | 1236 | if (unlikely(p->exit_state)) { |
1237 | clear_dead_task(timer, now); | 1237 | clear_dead_task(timer, now); |
1238 | return; | 1238 | goto out; |
1239 | } | 1239 | } |
1240 | read_lock(&tasklist_lock); /* arm_timer needs it. */ | 1240 | read_lock(&tasklist_lock); /* arm_timer needs it. */ |
1241 | } else { | 1241 | } else { |
@@ -1248,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1248 | put_task_struct(p); | 1248 | put_task_struct(p); |
1249 | timer->it.cpu.task = p = NULL; | 1249 | timer->it.cpu.task = p = NULL; |
1250 | timer->it.cpu.expires.sched = 0; | 1250 | timer->it.cpu.expires.sched = 0; |
1251 | read_unlock(&tasklist_lock); | 1251 | goto out_unlock; |
1252 | return; | ||
1253 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | 1252 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { |
1254 | /* | 1253 | /* |
1255 | * We've noticed that the thread is dead, but | 1254 | * We've noticed that the thread is dead, but |
@@ -1257,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1257 | * drop our task ref. | 1256 | * drop our task ref. |
1258 | */ | 1257 | */ |
1259 | clear_dead_task(timer, now); | 1258 | clear_dead_task(timer, now); |
1260 | read_unlock(&tasklist_lock); | 1259 | goto out_unlock; |
1261 | return; | ||
1262 | } | 1260 | } |
1263 | cpu_clock_sample_group(timer->it_clock, p, &now); | 1261 | cpu_clock_sample_group(timer->it_clock, p, &now); |
1264 | bump_cpu_timer(timer, now); | 1262 | bump_cpu_timer(timer, now); |
@@ -1270,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1270 | */ | 1268 | */ |
1271 | arm_timer(timer, now); | 1269 | arm_timer(timer, now); |
1272 | 1270 | ||
1271 | out_unlock: | ||
1273 | read_unlock(&tasklist_lock); | 1272 | read_unlock(&tasklist_lock); |
1273 | |||
1274 | out: | ||
1275 | timer->it_overrun_last = timer->it_overrun; | ||
1276 | timer->it_overrun = -1; | ||
1277 | ++timer->it_requeue_pending; | ||
1274 | } | 1278 | } |
1275 | 1279 | ||
1276 | /* | 1280 | /* |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index dda3cda73c77..ea55c7a1cd75 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -1295,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | |||
1295 | return error; | 1295 | return error; |
1296 | } | 1296 | } |
1297 | 1297 | ||
1298 | static void nanosleep_wake_up(unsigned long __data) | ||
1299 | { | ||
1300 | struct task_struct *p = (struct task_struct *) __data; | ||
1301 | |||
1302 | wake_up_process(p); | ||
1303 | } | ||
1304 | |||
1305 | /* | 1298 | /* |
1306 | * The standard says that an absolute nanosleep call MUST wake up at | 1299 | * The standard says that an absolute nanosleep call MUST wake up at |
1307 | * the requested time in spite of clock settings. Here is what we do: | 1300 | * the requested time in spite of clock settings. Here is what we do: |
@@ -1442,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock, | |||
1442 | int flags, struct timespec *tsave) | 1435 | int flags, struct timespec *tsave) |
1443 | { | 1436 | { |
1444 | struct timespec t, dum; | 1437 | struct timespec t, dum; |
1445 | struct timer_list new_timer; | ||
1446 | DECLARE_WAITQUEUE(abs_wqueue, current); | 1438 | DECLARE_WAITQUEUE(abs_wqueue, current); |
1447 | u64 rq_time = (u64)0; | 1439 | u64 rq_time = (u64)0; |
1448 | s64 left; | 1440 | s64 left; |
@@ -1451,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock, | |||
1451 | &current_thread_info()->restart_block; | 1443 | &current_thread_info()->restart_block; |
1452 | 1444 | ||
1453 | abs_wqueue.flags = 0; | 1445 | abs_wqueue.flags = 0; |
1454 | init_timer(&new_timer); | ||
1455 | new_timer.expires = 0; | ||
1456 | new_timer.data = (unsigned long) current; | ||
1457 | new_timer.function = nanosleep_wake_up; | ||
1458 | abs = flags & TIMER_ABSTIME; | 1446 | abs = flags & TIMER_ABSTIME; |
1459 | 1447 | ||
1460 | if (restart_block->fn == clock_nanosleep_restart) { | 1448 | if (restart_block->fn == clock_nanosleep_restart) { |
@@ -1490,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock, | |||
1490 | if (left < (s64)0) | 1478 | if (left < (s64)0) |
1491 | break; | 1479 | break; |
1492 | 1480 | ||
1493 | new_timer.expires = jiffies + left; | 1481 | schedule_timeout_interruptible(left); |
1494 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1495 | add_timer(&new_timer); | ||
1496 | |||
1497 | schedule(); | ||
1498 | 1482 | ||
1499 | del_timer_sync(&new_timer); | ||
1500 | left = rq_time - get_jiffies_64(); | 1483 | left = rq_time - get_jiffies_64(); |
1501 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); | 1484 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); |
1502 | 1485 | ||
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa13..c71eb4579c07 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -4,7 +4,7 @@ EXTRA_CFLAGS += -DDEBUG | |||
4 | endif | 4 | endif |
5 | 5 | ||
6 | obj-y := main.o process.o console.o pm.o | 6 | obj-y := main.o process.o console.o pm.o |
7 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o | 7 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o |
8 | 8 | ||
9 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | 9 | obj-$(CONFIG_SUSPEND_SMP) += smp.o |
10 | 10 | ||
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 761956e813f5..027322a564f4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -30,7 +30,6 @@ extern int swsusp_check(void); | |||
30 | extern int swsusp_read(void); | 30 | extern int swsusp_read(void); |
31 | extern void swsusp_close(void); | 31 | extern void swsusp_close(void); |
32 | extern int swsusp_resume(void); | 32 | extern int swsusp_resume(void); |
33 | extern int swsusp_free(void); | ||
34 | 33 | ||
35 | 34 | ||
36 | static int noresume = 0; | 35 | static int noresume = 0; |
@@ -93,10 +92,7 @@ static void free_some_memory(void) | |||
93 | printk("Freeing memory... "); | 92 | printk("Freeing memory... "); |
94 | while ((tmp = shrink_all_memory(10000))) { | 93 | while ((tmp = shrink_all_memory(10000))) { |
95 | pages += tmp; | 94 | pages += tmp; |
96 | printk("\b%c", p[i]); | 95 | printk("\b%c", p[i++ % 4]); |
97 | i++; | ||
98 | if (i > 3) | ||
99 | i = 0; | ||
100 | } | 96 | } |
101 | printk("\bdone (%li pages freed)\n", pages); | 97 | printk("\bdone (%li pages freed)\n", pages); |
102 | } | 98 | } |
@@ -178,13 +174,12 @@ int pm_suspend_disk(void) | |||
178 | goto Done; | 174 | goto Done; |
179 | 175 | ||
180 | if (in_suspend) { | 176 | if (in_suspend) { |
177 | device_resume(); | ||
181 | pr_debug("PM: writing image.\n"); | 178 | pr_debug("PM: writing image.\n"); |
182 | error = swsusp_write(); | 179 | error = swsusp_write(); |
183 | if (!error) | 180 | if (!error) |
184 | power_down(pm_disk_mode); | 181 | power_down(pm_disk_mode); |
185 | else { | 182 | else { |
186 | /* swsusp_write can not fail in device_resume, | ||
187 | no need to do second device_resume */ | ||
188 | swsusp_free(); | 183 | swsusp_free(); |
189 | unprepare_processes(); | 184 | unprepare_processes(); |
190 | return error; | 185 | return error; |
@@ -252,14 +247,17 @@ static int software_resume(void) | |||
252 | 247 | ||
253 | pr_debug("PM: Reading swsusp image.\n"); | 248 | pr_debug("PM: Reading swsusp image.\n"); |
254 | 249 | ||
255 | if ((error = swsusp_read())) | 250 | if ((error = swsusp_read())) { |
256 | goto Cleanup; | 251 | swsusp_free(); |
252 | goto Thaw; | ||
253 | } | ||
257 | 254 | ||
258 | pr_debug("PM: Preparing devices for restore.\n"); | 255 | pr_debug("PM: Preparing devices for restore.\n"); |
259 | 256 | ||
260 | if ((error = device_suspend(PMSG_FREEZE))) { | 257 | if ((error = device_suspend(PMSG_FREEZE))) { |
261 | printk("Some devices failed to suspend\n"); | 258 | printk("Some devices failed to suspend\n"); |
262 | goto Free; | 259 | swsusp_free(); |
260 | goto Thaw; | ||
263 | } | 261 | } |
264 | 262 | ||
265 | mb(); | 263 | mb(); |
@@ -268,9 +266,7 @@ static int software_resume(void) | |||
268 | swsusp_resume(); | 266 | swsusp_resume(); |
269 | pr_debug("PM: Restore failed, recovering.n"); | 267 | pr_debug("PM: Restore failed, recovering.n"); |
270 | device_resume(); | 268 | device_resume(); |
271 | Free: | 269 | Thaw: |
272 | swsusp_free(); | ||
273 | Cleanup: | ||
274 | unprepare_processes(); | 270 | unprepare_processes(); |
275 | Done: | 271 | Done: |
276 | /* For success case, the suspend path will release the lock */ | 272 | /* For success case, the suspend path will release the lock */ |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 22bdc93cc038..18d7d693fbba 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -167,6 +167,8 @@ static int enter_state(suspend_state_t state) | |||
167 | { | 167 | { |
168 | int error; | 168 | int error; |
169 | 169 | ||
170 | if (pm_ops->valid && !pm_ops->valid(state)) | ||
171 | return -ENODEV; | ||
170 | if (down_trylock(&pm_sem)) | 172 | if (down_trylock(&pm_sem)) |
171 | return -EBUSY; | 173 | return -EBUSY; |
172 | 174 | ||
@@ -236,7 +238,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) | |||
236 | char * s = buf; | 238 | char * s = buf; |
237 | 239 | ||
238 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | 240 | for (i = 0; i < PM_SUSPEND_MAX; i++) { |
239 | if (pm_states[i]) | 241 | if (pm_states[i] && pm_ops && (!pm_ops->valid |
242 | ||(pm_ops->valid && pm_ops->valid(i)))) | ||
240 | s += sprintf(s,"%s ",pm_states[i]); | 243 | s += sprintf(s,"%s ",pm_states[i]); |
241 | } | 244 | } |
242 | s += sprintf(s,"\n"); | 245 | s += sprintf(s,"\n"); |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 6748de23e83c..d4fd96a135ab 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -53,3 +53,20 @@ extern void thaw_processes(void); | |||
53 | 53 | ||
54 | extern int pm_prepare_console(void); | 54 | extern int pm_prepare_console(void); |
55 | extern void pm_restore_console(void); | 55 | extern void pm_restore_console(void); |
56 | |||
57 | |||
58 | /* References to section boundaries */ | ||
59 | extern const void __nosave_begin, __nosave_end; | ||
60 | |||
61 | extern unsigned int nr_copy_pages; | ||
62 | extern suspend_pagedir_t *pagedir_nosave; | ||
63 | extern suspend_pagedir_t *pagedir_save; | ||
64 | |||
65 | extern asmlinkage int swsusp_arch_suspend(void); | ||
66 | extern asmlinkage int swsusp_arch_resume(void); | ||
67 | |||
68 | extern int restore_highmem(void); | ||
69 | extern struct pbe * alloc_pagedir(unsigned nr_pages); | ||
70 | extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); | ||
71 | extern void swsusp_free(void); | ||
72 | extern int enough_swap(unsigned nr_pages); | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 000000000000..42a628704398 --- /dev/null +++ b/kernel/power/snapshot.c | |||
@@ -0,0 +1,435 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/snapshot.c | ||
3 | * | ||
4 | * This file provide system snapshot/restore functionality. | ||
5 | * | ||
6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> | ||
7 | * | ||
8 | * This file is released under the GPLv2, and is based on swsusp.c. | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/suspend.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/bitops.h> | ||
19 | #include <linux/spinlock.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/pm.h> | ||
22 | #include <linux/device.h> | ||
23 | #include <linux/bootmem.h> | ||
24 | #include <linux/syscalls.h> | ||
25 | #include <linux/console.h> | ||
26 | #include <linux/highmem.h> | ||
27 | |||
28 | #include <asm/uaccess.h> | ||
29 | #include <asm/mmu_context.h> | ||
30 | #include <asm/pgtable.h> | ||
31 | #include <asm/tlbflush.h> | ||
32 | #include <asm/io.h> | ||
33 | |||
34 | #include "power.h" | ||
35 | |||
36 | #ifdef CONFIG_HIGHMEM | ||
37 | struct highmem_page { | ||
38 | char *data; | ||
39 | struct page *page; | ||
40 | struct highmem_page *next; | ||
41 | }; | ||
42 | |||
43 | static struct highmem_page *highmem_copy; | ||
44 | |||
45 | static int save_highmem_zone(struct zone *zone) | ||
46 | { | ||
47 | unsigned long zone_pfn; | ||
48 | mark_free_pages(zone); | ||
49 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
50 | struct page *page; | ||
51 | struct highmem_page *save; | ||
52 | void *kaddr; | ||
53 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
54 | |||
55 | if (!(pfn%1000)) | ||
56 | printk("."); | ||
57 | if (!pfn_valid(pfn)) | ||
58 | continue; | ||
59 | page = pfn_to_page(pfn); | ||
60 | /* | ||
61 | * This condition results from rvmalloc() sans vmalloc_32() | ||
62 | * and architectural memory reservations. This should be | ||
63 | * corrected eventually when the cases giving rise to this | ||
64 | * are better understood. | ||
65 | */ | ||
66 | if (PageReserved(page)) { | ||
67 | printk("highmem reserved page?!\n"); | ||
68 | continue; | ||
69 | } | ||
70 | BUG_ON(PageNosave(page)); | ||
71 | if (PageNosaveFree(page)) | ||
72 | continue; | ||
73 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
74 | if (!save) | ||
75 | return -ENOMEM; | ||
76 | save->next = highmem_copy; | ||
77 | save->page = page; | ||
78 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
79 | if (!save->data) { | ||
80 | kfree(save); | ||
81 | return -ENOMEM; | ||
82 | } | ||
83 | kaddr = kmap_atomic(page, KM_USER0); | ||
84 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
85 | kunmap_atomic(kaddr, KM_USER0); | ||
86 | highmem_copy = save; | ||
87 | } | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | |||
92 | static int save_highmem(void) | ||
93 | { | ||
94 | struct zone *zone; | ||
95 | int res = 0; | ||
96 | |||
97 | pr_debug("swsusp: Saving Highmem\n"); | ||
98 | for_each_zone (zone) { | ||
99 | if (is_highmem(zone)) | ||
100 | res = save_highmem_zone(zone); | ||
101 | if (res) | ||
102 | return res; | ||
103 | } | ||
104 | return 0; | ||
105 | } | ||
106 | |||
107 | int restore_highmem(void) | ||
108 | { | ||
109 | printk("swsusp: Restoring Highmem\n"); | ||
110 | while (highmem_copy) { | ||
111 | struct highmem_page *save = highmem_copy; | ||
112 | void *kaddr; | ||
113 | highmem_copy = save->next; | ||
114 | |||
115 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
116 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
117 | kunmap_atomic(kaddr, KM_USER0); | ||
118 | free_page((long) save->data); | ||
119 | kfree(save); | ||
120 | } | ||
121 | return 0; | ||
122 | } | ||
123 | #else | ||
124 | static int save_highmem(void) { return 0; } | ||
125 | int restore_highmem(void) { return 0; } | ||
126 | #endif /* CONFIG_HIGHMEM */ | ||
127 | |||
128 | |||
129 | static int pfn_is_nosave(unsigned long pfn) | ||
130 | { | ||
131 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
132 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
133 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
134 | } | ||
135 | |||
136 | /** | ||
137 | * saveable - Determine whether a page should be cloned or not. | ||
138 | * @pfn: The page | ||
139 | * | ||
140 | * We save a page if it's Reserved, and not in the range of pages | ||
141 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
142 | * isn't part of a free chunk of pages. | ||
143 | */ | ||
144 | |||
145 | static int saveable(struct zone *zone, unsigned long *zone_pfn) | ||
146 | { | ||
147 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
148 | struct page *page; | ||
149 | |||
150 | if (!pfn_valid(pfn)) | ||
151 | return 0; | ||
152 | |||
153 | page = pfn_to_page(pfn); | ||
154 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
155 | if (PageNosave(page)) | ||
156 | return 0; | ||
157 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | ||
158 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
159 | return 0; | ||
160 | } | ||
161 | if (PageNosaveFree(page)) | ||
162 | return 0; | ||
163 | |||
164 | return 1; | ||
165 | } | ||
166 | |||
167 | static unsigned count_data_pages(void) | ||
168 | { | ||
169 | struct zone *zone; | ||
170 | unsigned long zone_pfn; | ||
171 | unsigned n; | ||
172 | |||
173 | n = 0; | ||
174 | for_each_zone (zone) { | ||
175 | if (is_highmem(zone)) | ||
176 | continue; | ||
177 | mark_free_pages(zone); | ||
178 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
179 | n += saveable(zone, &zone_pfn); | ||
180 | } | ||
181 | return n; | ||
182 | } | ||
183 | |||
184 | static void copy_data_pages(struct pbe *pblist) | ||
185 | { | ||
186 | struct zone *zone; | ||
187 | unsigned long zone_pfn; | ||
188 | struct pbe *pbe, *p; | ||
189 | |||
190 | pbe = pblist; | ||
191 | for_each_zone (zone) { | ||
192 | if (is_highmem(zone)) | ||
193 | continue; | ||
194 | mark_free_pages(zone); | ||
195 | /* This is necessary for swsusp_free() */ | ||
196 | for_each_pb_page (p, pblist) | ||
197 | SetPageNosaveFree(virt_to_page(p)); | ||
198 | for_each_pbe (p, pblist) | ||
199 | SetPageNosaveFree(virt_to_page(p->address)); | ||
200 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
201 | if (saveable(zone, &zone_pfn)) { | ||
202 | struct page *page; | ||
203 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
204 | BUG_ON(!pbe); | ||
205 | pbe->orig_address = (unsigned long)page_address(page); | ||
206 | /* copy_page is not usable for copying task structs. */ | ||
207 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
208 | pbe = pbe->next; | ||
209 | } | ||
210 | } | ||
211 | } | ||
212 | BUG_ON(pbe); | ||
213 | } | ||
214 | |||
215 | |||
216 | /** | ||
217 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
218 | */ | ||
219 | |||
220 | static void free_pagedir(struct pbe *pblist) | ||
221 | { | ||
222 | struct pbe *pbe; | ||
223 | |||
224 | while (pblist) { | ||
225 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
226 | ClearPageNosave(virt_to_page(pblist)); | ||
227 | ClearPageNosaveFree(virt_to_page(pblist)); | ||
228 | free_page((unsigned long)pblist); | ||
229 | pblist = pbe; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | /** | ||
234 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
235 | */ | ||
236 | |||
237 | static inline void fill_pb_page(struct pbe *pbpage) | ||
238 | { | ||
239 | struct pbe *p; | ||
240 | |||
241 | p = pbpage; | ||
242 | pbpage += PB_PAGE_SKIP; | ||
243 | do | ||
244 | p->next = p + 1; | ||
245 | while (++p < pbpage); | ||
246 | } | ||
247 | |||
248 | /** | ||
249 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
250 | * of memory pages allocated with alloc_pagedir() | ||
251 | */ | ||
252 | |||
253 | void create_pbe_list(struct pbe *pblist, unsigned nr_pages) | ||
254 | { | ||
255 | struct pbe *pbpage, *p; | ||
256 | unsigned num = PBES_PER_PAGE; | ||
257 | |||
258 | for_each_pb_page (pbpage, pblist) { | ||
259 | if (num >= nr_pages) | ||
260 | break; | ||
261 | |||
262 | fill_pb_page(pbpage); | ||
263 | num += PBES_PER_PAGE; | ||
264 | } | ||
265 | if (pbpage) { | ||
266 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
267 | p->next = p + 1; | ||
268 | p->next = NULL; | ||
269 | } | ||
270 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
271 | } | ||
272 | |||
273 | static void *alloc_image_page(void) | ||
274 | { | ||
275 | void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
276 | if (res) { | ||
277 | SetPageNosave(virt_to_page(res)); | ||
278 | SetPageNosaveFree(virt_to_page(res)); | ||
279 | } | ||
280 | return res; | ||
281 | } | ||
282 | |||
283 | /** | ||
284 | * alloc_pagedir - Allocate the page directory. | ||
285 | * | ||
286 | * First, determine exactly how many pages we need and | ||
287 | * allocate them. | ||
288 | * | ||
289 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
290 | * struct pbe elements (pbes) and the last element in the page points | ||
291 | * to the next page. | ||
292 | * | ||
293 | * On each page we set up a list of struct_pbe elements. | ||
294 | */ | ||
295 | |||
296 | struct pbe *alloc_pagedir(unsigned nr_pages) | ||
297 | { | ||
298 | unsigned num; | ||
299 | struct pbe *pblist, *pbe; | ||
300 | |||
301 | if (!nr_pages) | ||
302 | return NULL; | ||
303 | |||
304 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
305 | pblist = alloc_image_page(); | ||
306 | /* FIXME: rewrite this ugly loop */ | ||
307 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
308 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
309 | pbe += PB_PAGE_SKIP; | ||
310 | pbe->next = alloc_image_page(); | ||
311 | } | ||
312 | if (!pbe) { /* get_zeroed_page() failed */ | ||
313 | free_pagedir(pblist); | ||
314 | pblist = NULL; | ||
315 | } | ||
316 | return pblist; | ||
317 | } | ||
318 | |||
319 | /** | ||
320 | * Free pages we allocated for suspend. Suspend pages are alocated | ||
321 | * before atomic copy, so we need to free them after resume. | ||
322 | */ | ||
323 | |||
324 | void swsusp_free(void) | ||
325 | { | ||
326 | struct zone *zone; | ||
327 | unsigned long zone_pfn; | ||
328 | |||
329 | for_each_zone(zone) { | ||
330 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
331 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { | ||
332 | struct page * page; | ||
333 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
334 | if (PageNosave(page) && PageNosaveFree(page)) { | ||
335 | ClearPageNosave(page); | ||
336 | ClearPageNosaveFree(page); | ||
337 | free_page((long) page_address(page)); | ||
338 | } | ||
339 | } | ||
340 | } | ||
341 | } | ||
342 | |||
343 | |||
344 | /** | ||
345 | * enough_free_mem - Make sure we enough free memory to snapshot. | ||
346 | * | ||
347 | * Returns TRUE or FALSE after checking the number of available | ||
348 | * free pages. | ||
349 | */ | ||
350 | |||
351 | static int enough_free_mem(unsigned nr_pages) | ||
352 | { | ||
353 | pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); | ||
354 | return nr_free_pages() > (nr_pages + PAGES_FOR_IO + | ||
355 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
356 | } | ||
357 | |||
358 | |||
359 | static struct pbe *swsusp_alloc(unsigned nr_pages) | ||
360 | { | ||
361 | struct pbe *pblist, *p; | ||
362 | |||
363 | if (!(pblist = alloc_pagedir(nr_pages))) { | ||
364 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
365 | return NULL; | ||
366 | } | ||
367 | create_pbe_list(pblist, nr_pages); | ||
368 | |||
369 | for_each_pbe (p, pblist) { | ||
370 | p->address = (unsigned long)alloc_image_page(); | ||
371 | if (!p->address) { | ||
372 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
373 | swsusp_free(); | ||
374 | return NULL; | ||
375 | } | ||
376 | } | ||
377 | |||
378 | return pblist; | ||
379 | } | ||
380 | |||
381 | asmlinkage int swsusp_save(void) | ||
382 | { | ||
383 | unsigned nr_pages; | ||
384 | |||
385 | pr_debug("swsusp: critical section: \n"); | ||
386 | if (save_highmem()) { | ||
387 | printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n"); | ||
388 | restore_highmem(); | ||
389 | return -ENOMEM; | ||
390 | } | ||
391 | |||
392 | drain_local_pages(); | ||
393 | nr_pages = count_data_pages(); | ||
394 | printk("swsusp: Need to copy %u pages\n", nr_pages); | ||
395 | |||
396 | pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", | ||
397 | nr_pages, | ||
398 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, | ||
399 | PAGES_FOR_IO, nr_free_pages()); | ||
400 | |||
401 | /* This is needed because of the fixed size of swsusp_info */ | ||
402 | if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) | ||
403 | return -ENOSPC; | ||
404 | |||
405 | if (!enough_free_mem(nr_pages)) { | ||
406 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | ||
407 | return -ENOMEM; | ||
408 | } | ||
409 | |||
410 | if (!enough_swap(nr_pages)) { | ||
411 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | ||
412 | return -ENOSPC; | ||
413 | } | ||
414 | |||
415 | pagedir_nosave = swsusp_alloc(nr_pages); | ||
416 | if (!pagedir_nosave) | ||
417 | return -ENOMEM; | ||
418 | |||
419 | /* During allocating of suspend pagedir, new cold pages may appear. | ||
420 | * Kill them. | ||
421 | */ | ||
422 | drain_local_pages(); | ||
423 | copy_data_pages(pagedir_nosave); | ||
424 | |||
425 | /* | ||
426 | * End of critical section. From now on, we can write to memory, | ||
427 | * but we should not touch disk. This specially means we must _not_ | ||
428 | * touch swap space! Except we must write out our image of course. | ||
429 | */ | ||
430 | |||
431 | nr_copy_pages = nr_pages; | ||
432 | |||
433 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); | ||
434 | return 0; | ||
435 | } | ||
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 2d5c45676442..12db1d2ad61f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -1,11 +1,10 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/power/swsusp.c | 2 | * linux/kernel/power/swsusp.c |
3 | * | 3 | * |
4 | * This file is to realize architecture-independent | 4 | * This file provides code to write suspend image to swap and read it back. |
5 | * machine suspend feature using pretty near only high-level routines | ||
6 | * | 5 | * |
7 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> | 6 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> |
8 | * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> |
9 | * | 8 | * |
10 | * This file is released under the GPLv2. | 9 | * This file is released under the GPLv2. |
11 | * | 10 | * |
@@ -47,11 +46,7 @@ | |||
47 | #include <linux/utsname.h> | 46 | #include <linux/utsname.h> |
48 | #include <linux/version.h> | 47 | #include <linux/version.h> |
49 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
50 | #include <linux/reboot.h> | ||
51 | #include <linux/bitops.h> | 49 | #include <linux/bitops.h> |
52 | #include <linux/vt_kern.h> | ||
53 | #include <linux/kbd_kern.h> | ||
54 | #include <linux/keyboard.h> | ||
55 | #include <linux/spinlock.h> | 50 | #include <linux/spinlock.h> |
56 | #include <linux/genhd.h> | 51 | #include <linux/genhd.h> |
57 | #include <linux/kernel.h> | 52 | #include <linux/kernel.h> |
@@ -63,10 +58,8 @@ | |||
63 | #include <linux/swapops.h> | 58 | #include <linux/swapops.h> |
64 | #include <linux/bootmem.h> | 59 | #include <linux/bootmem.h> |
65 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
66 | #include <linux/console.h> | ||
67 | #include <linux/highmem.h> | 61 | #include <linux/highmem.h> |
68 | #include <linux/bio.h> | 62 | #include <linux/bio.h> |
69 | #include <linux/mount.h> | ||
70 | 63 | ||
71 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
72 | #include <asm/mmu_context.h> | 65 | #include <asm/mmu_context.h> |
@@ -84,16 +77,10 @@ | |||
84 | #define MAXKEY 32 | 77 | #define MAXKEY 32 |
85 | #define MAXIV 32 | 78 | #define MAXIV 32 |
86 | 79 | ||
87 | /* References to section boundaries */ | ||
88 | extern const void __nosave_begin, __nosave_end; | ||
89 | |||
90 | /* Variables to be preserved over suspend */ | ||
91 | static int nr_copy_pages_check; | ||
92 | |||
93 | extern char resume_file[]; | 80 | extern char resume_file[]; |
94 | 81 | ||
95 | /* Local variables that should not be affected by save */ | 82 | /* Local variables that should not be affected by save */ |
96 | static unsigned int nr_copy_pages __nosavedata = 0; | 83 | unsigned int nr_copy_pages __nosavedata = 0; |
97 | 84 | ||
98 | /* Suspend pagedir is allocated before final copy, therefore it | 85 | /* Suspend pagedir is allocated before final copy, therefore it |
99 | must be freed after resume | 86 | must be freed after resume |
@@ -109,7 +96,7 @@ static unsigned int nr_copy_pages __nosavedata = 0; | |||
109 | MMU hardware. | 96 | MMU hardware. |
110 | */ | 97 | */ |
111 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; | 98 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; |
112 | static suspend_pagedir_t *pagedir_save; | 99 | suspend_pagedir_t *pagedir_save; |
113 | 100 | ||
114 | #define SWSUSP_SIG "S1SUSPEND" | 101 | #define SWSUSP_SIG "S1SUSPEND" |
115 | 102 | ||
@@ -124,12 +111,6 @@ static struct swsusp_header { | |||
124 | static struct swsusp_info swsusp_info; | 111 | static struct swsusp_info swsusp_info; |
125 | 112 | ||
126 | /* | 113 | /* |
127 | * XXX: We try to keep some more pages free so that I/O operations succeed | ||
128 | * without paging. Might this be more? | ||
129 | */ | ||
130 | #define PAGES_FOR_IO 512 | ||
131 | |||
132 | /* | ||
133 | * Saving part... | 114 | * Saving part... |
134 | */ | 115 | */ |
135 | 116 | ||
@@ -552,346 +533,6 @@ static int write_suspend_image(void) | |||
552 | goto Done; | 533 | goto Done; |
553 | } | 534 | } |
554 | 535 | ||
555 | |||
556 | #ifdef CONFIG_HIGHMEM | ||
557 | struct highmem_page { | ||
558 | char *data; | ||
559 | struct page *page; | ||
560 | struct highmem_page *next; | ||
561 | }; | ||
562 | |||
563 | static struct highmem_page *highmem_copy; | ||
564 | |||
565 | static int save_highmem_zone(struct zone *zone) | ||
566 | { | ||
567 | unsigned long zone_pfn; | ||
568 | mark_free_pages(zone); | ||
569 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
570 | struct page *page; | ||
571 | struct highmem_page *save; | ||
572 | void *kaddr; | ||
573 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
574 | |||
575 | if (!(pfn%1000)) | ||
576 | printk("."); | ||
577 | if (!pfn_valid(pfn)) | ||
578 | continue; | ||
579 | page = pfn_to_page(pfn); | ||
580 | /* | ||
581 | * This condition results from rvmalloc() sans vmalloc_32() | ||
582 | * and architectural memory reservations. This should be | ||
583 | * corrected eventually when the cases giving rise to this | ||
584 | * are better understood. | ||
585 | */ | ||
586 | if (PageReserved(page)) { | ||
587 | printk("highmem reserved page?!\n"); | ||
588 | continue; | ||
589 | } | ||
590 | BUG_ON(PageNosave(page)); | ||
591 | if (PageNosaveFree(page)) | ||
592 | continue; | ||
593 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
594 | if (!save) | ||
595 | return -ENOMEM; | ||
596 | save->next = highmem_copy; | ||
597 | save->page = page; | ||
598 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
599 | if (!save->data) { | ||
600 | kfree(save); | ||
601 | return -ENOMEM; | ||
602 | } | ||
603 | kaddr = kmap_atomic(page, KM_USER0); | ||
604 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
605 | kunmap_atomic(kaddr, KM_USER0); | ||
606 | highmem_copy = save; | ||
607 | } | ||
608 | return 0; | ||
609 | } | ||
610 | #endif /* CONFIG_HIGHMEM */ | ||
611 | |||
612 | |||
613 | static int save_highmem(void) | ||
614 | { | ||
615 | #ifdef CONFIG_HIGHMEM | ||
616 | struct zone *zone; | ||
617 | int res = 0; | ||
618 | |||
619 | pr_debug("swsusp: Saving Highmem\n"); | ||
620 | for_each_zone (zone) { | ||
621 | if (is_highmem(zone)) | ||
622 | res = save_highmem_zone(zone); | ||
623 | if (res) | ||
624 | return res; | ||
625 | } | ||
626 | #endif | ||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | static int restore_highmem(void) | ||
631 | { | ||
632 | #ifdef CONFIG_HIGHMEM | ||
633 | printk("swsusp: Restoring Highmem\n"); | ||
634 | while (highmem_copy) { | ||
635 | struct highmem_page *save = highmem_copy; | ||
636 | void *kaddr; | ||
637 | highmem_copy = save->next; | ||
638 | |||
639 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
640 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
641 | kunmap_atomic(kaddr, KM_USER0); | ||
642 | free_page((long) save->data); | ||
643 | kfree(save); | ||
644 | } | ||
645 | #endif | ||
646 | return 0; | ||
647 | } | ||
648 | |||
649 | |||
650 | static int pfn_is_nosave(unsigned long pfn) | ||
651 | { | ||
652 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
653 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
654 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
655 | } | ||
656 | |||
657 | /** | ||
658 | * saveable - Determine whether a page should be cloned or not. | ||
659 | * @pfn: The page | ||
660 | * | ||
661 | * We save a page if it's Reserved, and not in the range of pages | ||
662 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
663 | * isn't part of a free chunk of pages. | ||
664 | */ | ||
665 | |||
666 | static int saveable(struct zone * zone, unsigned long * zone_pfn) | ||
667 | { | ||
668 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
669 | struct page * page; | ||
670 | |||
671 | if (!pfn_valid(pfn)) | ||
672 | return 0; | ||
673 | |||
674 | page = pfn_to_page(pfn); | ||
675 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
676 | if (PageNosave(page)) | ||
677 | return 0; | ||
678 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | ||
679 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
680 | return 0; | ||
681 | } | ||
682 | if (PageNosaveFree(page)) | ||
683 | return 0; | ||
684 | |||
685 | return 1; | ||
686 | } | ||
687 | |||
688 | static void count_data_pages(void) | ||
689 | { | ||
690 | struct zone *zone; | ||
691 | unsigned long zone_pfn; | ||
692 | |||
693 | nr_copy_pages = 0; | ||
694 | |||
695 | for_each_zone (zone) { | ||
696 | if (is_highmem(zone)) | ||
697 | continue; | ||
698 | mark_free_pages(zone); | ||
699 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
700 | nr_copy_pages += saveable(zone, &zone_pfn); | ||
701 | } | ||
702 | } | ||
703 | |||
704 | |||
705 | static void copy_data_pages(void) | ||
706 | { | ||
707 | struct zone *zone; | ||
708 | unsigned long zone_pfn; | ||
709 | struct pbe * pbe = pagedir_nosave; | ||
710 | |||
711 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); | ||
712 | for_each_zone (zone) { | ||
713 | if (is_highmem(zone)) | ||
714 | continue; | ||
715 | mark_free_pages(zone); | ||
716 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
717 | if (saveable(zone, &zone_pfn)) { | ||
718 | struct page * page; | ||
719 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
720 | BUG_ON(!pbe); | ||
721 | pbe->orig_address = (long) page_address(page); | ||
722 | /* copy_page is not usable for copying task structs. */ | ||
723 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
724 | pbe = pbe->next; | ||
725 | } | ||
726 | } | ||
727 | } | ||
728 | BUG_ON(pbe); | ||
729 | } | ||
730 | |||
731 | |||
732 | /** | ||
733 | * calc_nr - Determine the number of pages needed for a pbe list. | ||
734 | */ | ||
735 | |||
736 | static int calc_nr(int nr_copy) | ||
737 | { | ||
738 | return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); | ||
739 | } | ||
740 | |||
741 | /** | ||
742 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
743 | */ | ||
744 | |||
745 | static inline void free_pagedir(struct pbe *pblist) | ||
746 | { | ||
747 | struct pbe *pbe; | ||
748 | |||
749 | while (pblist) { | ||
750 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
751 | free_page((unsigned long)pblist); | ||
752 | pblist = pbe; | ||
753 | } | ||
754 | } | ||
755 | |||
756 | /** | ||
757 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
758 | */ | ||
759 | |||
760 | static inline void fill_pb_page(struct pbe *pbpage) | ||
761 | { | ||
762 | struct pbe *p; | ||
763 | |||
764 | p = pbpage; | ||
765 | pbpage += PB_PAGE_SKIP; | ||
766 | do | ||
767 | p->next = p + 1; | ||
768 | while (++p < pbpage); | ||
769 | } | ||
770 | |||
771 | /** | ||
772 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
773 | * of memory pages allocated with alloc_pagedir() | ||
774 | */ | ||
775 | |||
776 | static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) | ||
777 | { | ||
778 | struct pbe *pbpage, *p; | ||
779 | unsigned num = PBES_PER_PAGE; | ||
780 | |||
781 | for_each_pb_page (pbpage, pblist) { | ||
782 | if (num >= nr_pages) | ||
783 | break; | ||
784 | |||
785 | fill_pb_page(pbpage); | ||
786 | num += PBES_PER_PAGE; | ||
787 | } | ||
788 | if (pbpage) { | ||
789 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
790 | p->next = p + 1; | ||
791 | p->next = NULL; | ||
792 | } | ||
793 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
794 | } | ||
795 | |||
796 | /** | ||
797 | * alloc_pagedir - Allocate the page directory. | ||
798 | * | ||
799 | * First, determine exactly how many pages we need and | ||
800 | * allocate them. | ||
801 | * | ||
802 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
803 | * struct pbe elements (pbes) and the last element in the page points | ||
804 | * to the next page. | ||
805 | * | ||
806 | * On each page we set up a list of struct_pbe elements. | ||
807 | */ | ||
808 | |||
809 | static struct pbe * alloc_pagedir(unsigned nr_pages) | ||
810 | { | ||
811 | unsigned num; | ||
812 | struct pbe *pblist, *pbe; | ||
813 | |||
814 | if (!nr_pages) | ||
815 | return NULL; | ||
816 | |||
817 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
818 | pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
819 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
820 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
821 | pbe += PB_PAGE_SKIP; | ||
822 | pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
823 | } | ||
824 | if (!pbe) { /* get_zeroed_page() failed */ | ||
825 | free_pagedir(pblist); | ||
826 | pblist = NULL; | ||
827 | } | ||
828 | return pblist; | ||
829 | } | ||
830 | |||
831 | /** | ||
832 | * free_image_pages - Free pages allocated for snapshot | ||
833 | */ | ||
834 | |||
835 | static void free_image_pages(void) | ||
836 | { | ||
837 | struct pbe * p; | ||
838 | |||
839 | for_each_pbe (p, pagedir_save) { | ||
840 | if (p->address) { | ||
841 | ClearPageNosave(virt_to_page(p->address)); | ||
842 | free_page(p->address); | ||
843 | p->address = 0; | ||
844 | } | ||
845 | } | ||
846 | } | ||
847 | |||
848 | /** | ||
849 | * alloc_image_pages - Allocate pages for the snapshot. | ||
850 | */ | ||
851 | |||
852 | static int alloc_image_pages(void) | ||
853 | { | ||
854 | struct pbe * p; | ||
855 | |||
856 | for_each_pbe (p, pagedir_save) { | ||
857 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
858 | if (!p->address) | ||
859 | return -ENOMEM; | ||
860 | SetPageNosave(virt_to_page(p->address)); | ||
861 | } | ||
862 | return 0; | ||
863 | } | ||
864 | |||
865 | /* Free pages we allocated for suspend. Suspend pages are alocated | ||
866 | * before atomic copy, so we need to free them after resume. | ||
867 | */ | ||
868 | void swsusp_free(void) | ||
869 | { | ||
870 | BUG_ON(PageNosave(virt_to_page(pagedir_save))); | ||
871 | BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); | ||
872 | free_image_pages(); | ||
873 | free_pagedir(pagedir_save); | ||
874 | } | ||
875 | |||
876 | |||
877 | /** | ||
878 | * enough_free_mem - Make sure we enough free memory to snapshot. | ||
879 | * | ||
880 | * Returns TRUE or FALSE after checking the number of available | ||
881 | * free pages. | ||
882 | */ | ||
883 | |||
884 | static int enough_free_mem(void) | ||
885 | { | ||
886 | if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { | ||
887 | pr_debug("swsusp: Not enough free pages: Have %d\n", | ||
888 | nr_free_pages()); | ||
889 | return 0; | ||
890 | } | ||
891 | return 1; | ||
892 | } | ||
893 | |||
894 | |||
895 | /** | 536 | /** |
896 | * enough_swap - Make sure we have enough swap to save the image. | 537 | * enough_swap - Make sure we have enough swap to save the image. |
897 | * | 538 | * |
@@ -902,87 +543,14 @@ static int enough_free_mem(void) | |||
902 | * We should only consider resume_device. | 543 | * We should only consider resume_device. |
903 | */ | 544 | */ |
904 | 545 | ||
905 | static int enough_swap(void) | 546 | int enough_swap(unsigned nr_pages) |
906 | { | 547 | { |
907 | struct sysinfo i; | 548 | struct sysinfo i; |
908 | 549 | ||
909 | si_swapinfo(&i); | 550 | si_swapinfo(&i); |
910 | if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { | 551 | pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); |
911 | pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); | 552 | return i.freeswap > (nr_pages + PAGES_FOR_IO + |
912 | return 0; | 553 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); |
913 | } | ||
914 | return 1; | ||
915 | } | ||
916 | |||
917 | static int swsusp_alloc(void) | ||
918 | { | ||
919 | int error; | ||
920 | |||
921 | pagedir_nosave = NULL; | ||
922 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
923 | nr_copy_pages_check = nr_copy_pages; | ||
924 | |||
925 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", | ||
926 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); | ||
927 | |||
928 | if (!enough_free_mem()) | ||
929 | return -ENOMEM; | ||
930 | |||
931 | if (!enough_swap()) | ||
932 | return -ENOSPC; | ||
933 | |||
934 | if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE + | ||
935 | !!(nr_copy_pages % PBES_PER_PAGE)) | ||
936 | return -ENOSPC; | ||
937 | |||
938 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { | ||
939 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
940 | return -ENOMEM; | ||
941 | } | ||
942 | create_pbe_list(pagedir_save, nr_copy_pages); | ||
943 | pagedir_nosave = pagedir_save; | ||
944 | if ((error = alloc_image_pages())) { | ||
945 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
946 | swsusp_free(); | ||
947 | return error; | ||
948 | } | ||
949 | |||
950 | return 0; | ||
951 | } | ||
952 | |||
953 | static int suspend_prepare_image(void) | ||
954 | { | ||
955 | int error; | ||
956 | |||
957 | pr_debug("swsusp: critical section: \n"); | ||
958 | if (save_highmem()) { | ||
959 | printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); | ||
960 | restore_highmem(); | ||
961 | return -ENOMEM; | ||
962 | } | ||
963 | |||
964 | drain_local_pages(); | ||
965 | count_data_pages(); | ||
966 | printk("swsusp: Need to copy %u pages\n", nr_copy_pages); | ||
967 | |||
968 | error = swsusp_alloc(); | ||
969 | if (error) | ||
970 | return error; | ||
971 | |||
972 | /* During allocating of suspend pagedir, new cold pages may appear. | ||
973 | * Kill them. | ||
974 | */ | ||
975 | drain_local_pages(); | ||
976 | copy_data_pages(); | ||
977 | |||
978 | /* | ||
979 | * End of critical section. From now on, we can write to memory, | ||
980 | * but we should not touch disk. This specially means we must _not_ | ||
981 | * touch swap space! Except we must write out our image of course. | ||
982 | */ | ||
983 | |||
984 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); | ||
985 | return 0; | ||
986 | } | 554 | } |
987 | 555 | ||
988 | 556 | ||
@@ -994,7 +562,7 @@ static int suspend_prepare_image(void) | |||
994 | int swsusp_write(void) | 562 | int swsusp_write(void) |
995 | { | 563 | { |
996 | int error; | 564 | int error; |
997 | device_resume(); | 565 | |
998 | lock_swapdevices(); | 566 | lock_swapdevices(); |
999 | error = write_suspend_image(); | 567 | error = write_suspend_image(); |
1000 | /* This will unlock ignored swap devices since writing is finished */ | 568 | /* This will unlock ignored swap devices since writing is finished */ |
@@ -1004,14 +572,6 @@ int swsusp_write(void) | |||
1004 | } | 572 | } |
1005 | 573 | ||
1006 | 574 | ||
1007 | extern asmlinkage int swsusp_arch_suspend(void); | ||
1008 | extern asmlinkage int swsusp_arch_resume(void); | ||
1009 | |||
1010 | |||
1011 | asmlinkage int swsusp_save(void) | ||
1012 | { | ||
1013 | return suspend_prepare_image(); | ||
1014 | } | ||
1015 | 575 | ||
1016 | int swsusp_suspend(void) | 576 | int swsusp_suspend(void) |
1017 | { | 577 | { |
@@ -1043,7 +603,6 @@ int swsusp_suspend(void) | |||
1043 | printk(KERN_ERR "Error %d suspending\n", error); | 603 | printk(KERN_ERR "Error %d suspending\n", error); |
1044 | /* Restore control flow magically appears here */ | 604 | /* Restore control flow magically appears here */ |
1045 | restore_processor_state(); | 605 | restore_processor_state(); |
1046 | BUG_ON (nr_copy_pages_check != nr_copy_pages); | ||
1047 | restore_highmem(); | 606 | restore_highmem(); |
1048 | device_power_up(); | 607 | device_power_up(); |
1049 | local_irq_enable(); | 608 | local_irq_enable(); |
@@ -1063,6 +622,11 @@ int swsusp_resume(void) | |||
1063 | * execution continues at place where swsusp_arch_suspend was called | 622 | * execution continues at place where swsusp_arch_suspend was called |
1064 | */ | 623 | */ |
1065 | BUG_ON(!error); | 624 | BUG_ON(!error); |
625 | /* The only reason why swsusp_arch_resume() can fail is memory being | ||
626 | * very tight, so we have to free it as soon as we can to avoid | ||
627 | * subsequent failures | ||
628 | */ | ||
629 | swsusp_free(); | ||
1066 | restore_processor_state(); | 630 | restore_processor_state(); |
1067 | restore_highmem(); | 631 | restore_highmem(); |
1068 | touch_softlockup_watchdog(); | 632 | touch_softlockup_watchdog(); |
@@ -1078,54 +642,28 @@ int swsusp_resume(void) | |||
1078 | * | 642 | * |
1079 | * We don't know which pages are usable until we allocate them. | 643 | * We don't know which pages are usable until we allocate them. |
1080 | * | 644 | * |
1081 | * Allocated but unusable (ie eaten) memory pages are linked together | 645 | * Allocated but unusable (ie eaten) memory pages are marked so that |
1082 | * to create a list, so that we can free them easily | 646 | * swsusp_free() can release them |
1083 | * | ||
1084 | * We could have used a type other than (void *) | ||
1085 | * for this purpose, but ... | ||
1086 | */ | 647 | */ |
1087 | static void **eaten_memory = NULL; | ||
1088 | |||
1089 | static inline void eat_page(void *page) | ||
1090 | { | ||
1091 | void **c; | ||
1092 | |||
1093 | c = eaten_memory; | ||
1094 | eaten_memory = page; | ||
1095 | *eaten_memory = c; | ||
1096 | } | ||
1097 | 648 | ||
1098 | unsigned long get_usable_page(unsigned gfp_mask) | 649 | unsigned long get_safe_page(gfp_t gfp_mask) |
1099 | { | 650 | { |
1100 | unsigned long m; | 651 | unsigned long m; |
1101 | 652 | ||
1102 | m = get_zeroed_page(gfp_mask); | 653 | do { |
1103 | while (!PageNosaveFree(virt_to_page(m))) { | ||
1104 | eat_page((void *)m); | ||
1105 | m = get_zeroed_page(gfp_mask); | 654 | m = get_zeroed_page(gfp_mask); |
1106 | if (!m) | 655 | if (m && PageNosaveFree(virt_to_page(m))) |
1107 | break; | 656 | /* This is for swsusp_free() */ |
657 | SetPageNosave(virt_to_page(m)); | ||
658 | } while (m && PageNosaveFree(virt_to_page(m))); | ||
659 | if (m) { | ||
660 | /* This is for swsusp_free() */ | ||
661 | SetPageNosave(virt_to_page(m)); | ||
662 | SetPageNosaveFree(virt_to_page(m)); | ||
1108 | } | 663 | } |
1109 | return m; | 664 | return m; |
1110 | } | 665 | } |
1111 | 666 | ||
1112 | void free_eaten_memory(void) | ||
1113 | { | ||
1114 | unsigned long m; | ||
1115 | void **c; | ||
1116 | int i = 0; | ||
1117 | |||
1118 | c = eaten_memory; | ||
1119 | while (c) { | ||
1120 | m = (unsigned long)c; | ||
1121 | c = *c; | ||
1122 | free_page(m); | ||
1123 | i++; | ||
1124 | } | ||
1125 | eaten_memory = NULL; | ||
1126 | pr_debug("swsusp: %d unused pages freed\n", i); | ||
1127 | } | ||
1128 | |||
1129 | /** | 667 | /** |
1130 | * check_pagedir - We ensure here that pages that the PBEs point to | 668 | * check_pagedir - We ensure here that pages that the PBEs point to |
1131 | * won't collide with pages where we're going to restore from the loaded | 669 | * won't collide with pages where we're going to restore from the loaded |
@@ -1143,7 +681,7 @@ static int check_pagedir(struct pbe *pblist) | |||
1143 | p->address = 0UL; | 681 | p->address = 0UL; |
1144 | 682 | ||
1145 | for_each_pbe (p, pblist) { | 683 | for_each_pbe (p, pblist) { |
1146 | p->address = get_usable_page(GFP_ATOMIC); | 684 | p->address = get_safe_page(GFP_ATOMIC); |
1147 | if (!p->address) | 685 | if (!p->address) |
1148 | return -ENOMEM; | 686 | return -ENOMEM; |
1149 | } | 687 | } |
@@ -1162,7 +700,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
1162 | unsigned long zone_pfn; | 700 | unsigned long zone_pfn; |
1163 | struct pbe *pbpage, *tail, *p; | 701 | struct pbe *pbpage, *tail, *p; |
1164 | void *m; | 702 | void *m; |
1165 | int rel = 0, error = 0; | 703 | int rel = 0; |
1166 | 704 | ||
1167 | if (!pblist) /* a sanity check */ | 705 | if (!pblist) /* a sanity check */ |
1168 | return NULL; | 706 | return NULL; |
@@ -1170,41 +708,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
1170 | pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", | 708 | pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", |
1171 | swsusp_info.pagedir_pages); | 709 | swsusp_info.pagedir_pages); |
1172 | 710 | ||
1173 | /* Set page flags */ | 711 | /* Clear page flags */ |
1174 | 712 | ||
1175 | for_each_zone (zone) { | 713 | for_each_zone (zone) { |
1176 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 714 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) |
1177 | SetPageNosaveFree(pfn_to_page(zone_pfn + | 715 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) |
716 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | ||
1178 | zone->zone_start_pfn)); | 717 | zone->zone_start_pfn)); |
1179 | } | 718 | } |
1180 | 719 | ||
1181 | /* Clear orig addresses */ | 720 | /* Mark orig addresses */ |
1182 | 721 | ||
1183 | for_each_pbe (p, pblist) | 722 | for_each_pbe (p, pblist) |
1184 | ClearPageNosaveFree(virt_to_page(p->orig_address)); | 723 | SetPageNosaveFree(virt_to_page(p->orig_address)); |
1185 | 724 | ||
1186 | tail = pblist + PB_PAGE_SKIP; | 725 | tail = pblist + PB_PAGE_SKIP; |
1187 | 726 | ||
1188 | /* Relocate colliding pages */ | 727 | /* Relocate colliding pages */ |
1189 | 728 | ||
1190 | for_each_pb_page (pbpage, pblist) { | 729 | for_each_pb_page (pbpage, pblist) { |
1191 | if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { | 730 | if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) { |
1192 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); | 731 | m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD); |
1193 | if (!m) { | 732 | if (!m) |
1194 | error = -ENOMEM; | 733 | return NULL; |
1195 | break; | ||
1196 | } | ||
1197 | memcpy(m, (void *)pbpage, PAGE_SIZE); | 734 | memcpy(m, (void *)pbpage, PAGE_SIZE); |
1198 | if (pbpage == pblist) | 735 | if (pbpage == pblist) |
1199 | pblist = (struct pbe *)m; | 736 | pblist = (struct pbe *)m; |
1200 | else | 737 | else |
1201 | tail->next = (struct pbe *)m; | 738 | tail->next = (struct pbe *)m; |
1202 | |||
1203 | eat_page((void *)pbpage); | ||
1204 | pbpage = (struct pbe *)m; | 739 | pbpage = (struct pbe *)m; |
1205 | 740 | ||
1206 | /* We have to link the PBEs again */ | 741 | /* We have to link the PBEs again */ |
1207 | |||
1208 | for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) | 742 | for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) |
1209 | if (p->next) /* needed to save the end */ | 743 | if (p->next) /* needed to save the end */ |
1210 | p->next = p + 1; | 744 | p->next = p + 1; |
@@ -1214,15 +748,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
1214 | tail = pbpage + PB_PAGE_SKIP; | 748 | tail = pbpage + PB_PAGE_SKIP; |
1215 | } | 749 | } |
1216 | 750 | ||
1217 | if (error) { | 751 | /* This is for swsusp_free() */ |
1218 | printk("\nswsusp: Out of memory\n\n"); | 752 | for_each_pb_page (pbpage, pblist) { |
1219 | free_pagedir(pblist); | 753 | SetPageNosave(virt_to_page(pbpage)); |
1220 | free_eaten_memory(); | 754 | SetPageNosaveFree(virt_to_page(pbpage)); |
1221 | pblist = NULL; | 755 | } |
1222 | /* Is this even worth handling? It should never ever happen, and we | 756 | |
1223 | have just lost user's state, anyway... */ | 757 | printk("swsusp: Relocated %d pages\n", rel); |
1224 | } else | ||
1225 | printk("swsusp: Relocated %d pages\n", rel); | ||
1226 | 758 | ||
1227 | return pblist; | 759 | return pblist; |
1228 | } | 760 | } |
@@ -1440,9 +972,7 @@ static int read_pagedir(struct pbe *pblist) | |||
1440 | break; | 972 | break; |
1441 | } | 973 | } |
1442 | 974 | ||
1443 | if (error) | 975 | if (!error) |
1444 | free_pagedir(pblist); | ||
1445 | else | ||
1446 | BUG_ON(i != swsusp_info.pagedir_pages); | 976 | BUG_ON(i != swsusp_info.pagedir_pages); |
1447 | 977 | ||
1448 | return error; | 978 | return error; |
@@ -1485,15 +1015,6 @@ static int read_suspend_image(void) | |||
1485 | if (!error) | 1015 | if (!error) |
1486 | error = data_read(pagedir_nosave); | 1016 | error = data_read(pagedir_nosave); |
1487 | 1017 | ||
1488 | if (error) { /* We fail cleanly */ | ||
1489 | free_eaten_memory(); | ||
1490 | for_each_pbe (p, pagedir_nosave) | ||
1491 | if (p->address) { | ||
1492 | free_page(p->address); | ||
1493 | p->address = 0UL; | ||
1494 | } | ||
1495 | free_pagedir(pagedir_nosave); | ||
1496 | } | ||
1497 | return error; | 1018 | return error; |
1498 | } | 1019 | } |
1499 | 1020 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index 4b8f0f9230a4..3cb9708209bc 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -10,7 +10,7 @@ | |||
10 | * elsewhere, in preparation for a serial line console (someday). | 10 | * elsewhere, in preparation for a serial line console (someday). |
11 | * Ted Ts'o, 2/11/93. | 11 | * Ted Ts'o, 2/11/93. |
12 | * Modified for sysctl support, 1/8/97, Chris Horn. | 12 | * Modified for sysctl support, 1/8/97, Chris Horn. |
13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul | 13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul |
14 | * manfreds@colorfullife.com | 14 | * manfreds@colorfullife.com |
15 | * Rewrote bits to get rid of console_lock | 15 | * Rewrote bits to get rid of console_lock |
16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> | 16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> |
@@ -148,7 +148,7 @@ static int __init console_setup(char *str) | |||
148 | if (!strcmp(str, "ttyb")) | 148 | if (!strcmp(str, "ttyb")) |
149 | strcpy(name, "ttyS1"); | 149 | strcpy(name, "ttyS1"); |
150 | #endif | 150 | #endif |
151 | for(s = name; *s; s++) | 151 | for (s = name; *s; s++) |
152 | if ((*s >= '0' && *s <= '9') || *s == ',') | 152 | if ((*s >= '0' && *s <= '9') || *s == ',') |
153 | break; | 153 | break; |
154 | idx = simple_strtoul(s, NULL, 10); | 154 | idx = simple_strtoul(s, NULL, 10); |
@@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) | |||
169 | size = roundup_pow_of_two(size); | 169 | size = roundup_pow_of_two(size); |
170 | if (size > log_buf_len) { | 170 | if (size > log_buf_len) { |
171 | unsigned long start, dest_idx, offset; | 171 | unsigned long start, dest_idx, offset; |
172 | char * new_log_buf; | 172 | char *new_log_buf; |
173 | 173 | ||
174 | new_log_buf = alloc_bootmem(size); | 174 | new_log_buf = alloc_bootmem(size); |
175 | if (!new_log_buf) { | 175 | if (!new_log_buf) { |
176 | printk("log_buf_len: allocation failed\n"); | 176 | printk(KERN_WARNING "log_buf_len: allocation failed\n"); |
177 | goto out; | 177 | goto out; |
178 | } | 178 | } |
179 | 179 | ||
@@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) | |||
193 | log_end -= offset; | 193 | log_end -= offset; |
194 | spin_unlock_irqrestore(&logbuf_lock, flags); | 194 | spin_unlock_irqrestore(&logbuf_lock, flags); |
195 | 195 | ||
196 | printk("log_buf_len: %d\n", log_buf_len); | 196 | printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); |
197 | } | 197 | } |
198 | out: | 198 | out: |
199 | |||
200 | return 1; | 199 | return 1; |
201 | } | 200 | } |
202 | 201 | ||
@@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); | |||
217 | * 9 -- Return number of unread characters in the log buffer | 216 | * 9 -- Return number of unread characters in the log buffer |
218 | * 10 -- Return size of the log buffer | 217 | * 10 -- Return size of the log buffer |
219 | */ | 218 | */ |
220 | int do_syslog(int type, char __user * buf, int len) | 219 | int do_syslog(int type, char __user *buf, int len) |
221 | { | 220 | { |
222 | unsigned long i, j, limit, count; | 221 | unsigned long i, j, limit, count; |
223 | int do_clear = 0; | 222 | int do_clear = 0; |
@@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len) | |||
244 | error = -EFAULT; | 243 | error = -EFAULT; |
245 | goto out; | 244 | goto out; |
246 | } | 245 | } |
247 | error = wait_event_interruptible(log_wait, (log_start - log_end)); | 246 | error = wait_event_interruptible(log_wait, |
247 | (log_start - log_end)); | ||
248 | if (error) | 248 | if (error) |
249 | goto out; | 249 | goto out; |
250 | i = 0; | 250 | i = 0; |
@@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) | |||
264 | error = i; | 264 | error = i; |
265 | break; | 265 | break; |
266 | case 4: /* Read/clear last kernel messages */ | 266 | case 4: /* Read/clear last kernel messages */ |
267 | do_clear = 1; | 267 | do_clear = 1; |
268 | /* FALL THRU */ | 268 | /* FALL THRU */ |
269 | case 3: /* Read last kernel messages */ | 269 | case 3: /* Read last kernel messages */ |
270 | error = -EINVAL; | 270 | error = -EINVAL; |
@@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) | |||
288 | limit = log_end; | 288 | limit = log_end; |
289 | /* | 289 | /* |
290 | * __put_user() could sleep, and while we sleep | 290 | * __put_user() could sleep, and while we sleep |
291 | * printk() could overwrite the messages | 291 | * printk() could overwrite the messages |
292 | * we try to copy to user space. Therefore | 292 | * we try to copy to user space. Therefore |
293 | * the messages are copied in reverse. <manfreds> | 293 | * the messages are copied in reverse. <manfreds> |
294 | */ | 294 | */ |
295 | for(i = 0; i < count && !error; i++) { | 295 | for (i = 0; i < count && !error; i++) { |
296 | j = limit-1-i; | 296 | j = limit-1-i; |
297 | if (j + log_buf_len < log_end) | 297 | if (j + log_buf_len < log_end) |
298 | break; | 298 | break; |
@@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) | |||
306 | if (error) | 306 | if (error) |
307 | break; | 307 | break; |
308 | error = i; | 308 | error = i; |
309 | if(i != count) { | 309 | if (i != count) { |
310 | int offset = count-error; | 310 | int offset = count-error; |
311 | /* buffer overflow during copy, correct user buffer. */ | 311 | /* buffer overflow during copy, correct user buffer. */ |
312 | for(i=0;i<error;i++) { | 312 | for (i = 0; i < error; i++) { |
313 | if (__get_user(c,&buf[i+offset]) || | 313 | if (__get_user(c,&buf[i+offset]) || |
314 | __put_user(c,&buf[i])) { | 314 | __put_user(c,&buf[i])) { |
315 | error = -EFAULT; | 315 | error = -EFAULT; |
@@ -351,7 +351,7 @@ out: | |||
351 | return error; | 351 | return error; |
352 | } | 352 | } |
353 | 353 | ||
354 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | 354 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
355 | { | 355 | { |
356 | return do_syslog(type, buf, len); | 356 | return do_syslog(type, buf, len); |
357 | } | 357 | } |
@@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end) | |||
404 | cur_index = start; | 404 | cur_index = start; |
405 | start_print = start; | 405 | start_print = start; |
406 | while (cur_index != end) { | 406 | while (cur_index != end) { |
407 | if ( msg_level < 0 && | 407 | if (msg_level < 0 && ((end - cur_index) > 2) && |
408 | ((end - cur_index) > 2) && | 408 | LOG_BUF(cur_index + 0) == '<' && |
409 | LOG_BUF(cur_index + 0) == '<' && | 409 | LOG_BUF(cur_index + 1) >= '0' && |
410 | LOG_BUF(cur_index + 1) >= '0' && | 410 | LOG_BUF(cur_index + 1) <= '7' && |
411 | LOG_BUF(cur_index + 1) <= '7' && | 411 | LOG_BUF(cur_index + 2) == '>') { |
412 | LOG_BUF(cur_index + 2) == '>') | ||
413 | { | ||
414 | msg_level = LOG_BUF(cur_index + 1) - '0'; | 412 | msg_level = LOG_BUF(cur_index + 1) - '0'; |
415 | cur_index += 3; | 413 | cur_index += 3; |
416 | start_print = cur_index; | 414 | start_print = cur_index; |
417 | } | 415 | } |
418 | while (cur_index != end) { | 416 | while (cur_index != end) { |
419 | char c = LOG_BUF(cur_index); | 417 | char c = LOG_BUF(cur_index); |
420 | cur_index++; | ||
421 | 418 | ||
419 | cur_index++; | ||
422 | if (c == '\n') { | 420 | if (c == '\n') { |
423 | if (msg_level < 0) { | 421 | if (msg_level < 0) { |
424 | /* | 422 | /* |
@@ -461,7 +459,7 @@ static void zap_locks(void) | |||
461 | static unsigned long oops_timestamp; | 459 | static unsigned long oops_timestamp; |
462 | 460 | ||
463 | if (time_after_eq(jiffies, oops_timestamp) && | 461 | if (time_after_eq(jiffies, oops_timestamp) && |
464 | !time_after(jiffies, oops_timestamp + 30*HZ)) | 462 | !time_after(jiffies, oops_timestamp + 30 * HZ)) |
465 | return; | 463 | return; |
466 | 464 | ||
467 | oops_timestamp = jiffies; | 465 | oops_timestamp = jiffies; |
@@ -495,7 +493,7 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
495 | 493 | ||
496 | /* | 494 | /* |
497 | * This is printk. It can be called from any context. We want it to work. | 495 | * This is printk. It can be called from any context. We want it to work. |
498 | * | 496 | * |
499 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 497 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and |
500 | * call the console drivers. If we fail to get the semaphore we place the output | 498 | * call the console drivers. If we fail to get the semaphore we place the output |
501 | * into the log buffer and return. The current holder of the console_sem will | 499 | * into the log buffer and return. The current holder of the console_sem will |
@@ -639,13 +637,19 @@ EXPORT_SYMBOL(vprintk); | |||
639 | 637 | ||
640 | #else | 638 | #else |
641 | 639 | ||
642 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | 640 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
643 | { | 641 | { |
644 | return 0; | 642 | return 0; |
645 | } | 643 | } |
646 | 644 | ||
647 | int do_syslog(int type, char __user * buf, int len) { return 0; } | 645 | int do_syslog(int type, char __user *buf, int len) |
648 | static void call_console_drivers(unsigned long start, unsigned long end) {} | 646 | { |
647 | return 0; | ||
648 | } | ||
649 | |||
650 | static void call_console_drivers(unsigned long start, unsigned long end) | ||
651 | { | ||
652 | } | ||
649 | 653 | ||
650 | #endif | 654 | #endif |
651 | 655 | ||
@@ -851,9 +855,9 @@ EXPORT_SYMBOL(console_start); | |||
851 | * print any messages that were printed by the kernel before the | 855 | * print any messages that were printed by the kernel before the |
852 | * console driver was initialized. | 856 | * console driver was initialized. |
853 | */ | 857 | */ |
854 | void register_console(struct console * console) | 858 | void register_console(struct console *console) |
855 | { | 859 | { |
856 | int i; | 860 | int i; |
857 | unsigned long flags; | 861 | unsigned long flags; |
858 | 862 | ||
859 | if (preferred_console < 0) | 863 | if (preferred_console < 0) |
@@ -878,7 +882,8 @@ void register_console(struct console * console) | |||
878 | * See if this console matches one we selected on | 882 | * See if this console matches one we selected on |
879 | * the command line. | 883 | * the command line. |
880 | */ | 884 | */ |
881 | for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { | 885 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; |
886 | i++) { | ||
882 | if (strcmp(console_cmdline[i].name, console->name) != 0) | 887 | if (strcmp(console_cmdline[i].name, console->name) != 0) |
883 | continue; | 888 | continue; |
884 | if (console->index >= 0 && | 889 | if (console->index >= 0 && |
@@ -933,9 +938,9 @@ void register_console(struct console * console) | |||
933 | } | 938 | } |
934 | EXPORT_SYMBOL(register_console); | 939 | EXPORT_SYMBOL(register_console); |
935 | 940 | ||
936 | int unregister_console(struct console * console) | 941 | int unregister_console(struct console *console) |
937 | { | 942 | { |
938 | struct console *a,*b; | 943 | struct console *a, *b; |
939 | int res = 1; | 944 | int res = 1; |
940 | 945 | ||
941 | acquire_console_sem(); | 946 | acquire_console_sem(); |
@@ -949,10 +954,10 @@ int unregister_console(struct console * console) | |||
949 | b->next = a->next; | 954 | b->next = a->next; |
950 | res = 0; | 955 | res = 0; |
951 | break; | 956 | break; |
952 | } | 957 | } |
953 | } | 958 | } |
954 | } | 959 | } |
955 | 960 | ||
956 | /* If last console is removed, we re-enable picking the first | 961 | /* If last console is removed, we re-enable picking the first |
957 | * one that gets registered. Without that, pmac early boot console | 962 | * one that gets registered. Without that, pmac early boot console |
958 | * would prevent fbcon from taking over. | 963 | * would prevent fbcon from taking over. |
@@ -994,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) | |||
994 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | 999 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) |
995 | { | 1000 | { |
996 | static DEFINE_SPINLOCK(ratelimit_lock); | 1001 | static DEFINE_SPINLOCK(ratelimit_lock); |
997 | static unsigned long toks = 10*5*HZ; | 1002 | static unsigned long toks = 10 * 5 * HZ; |
998 | static unsigned long last_msg; | 1003 | static unsigned long last_msg; |
999 | static int missed; | 1004 | static int missed; |
1000 | unsigned long flags; | 1005 | unsigned long flags; |
@@ -1007,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | |||
1007 | toks = ratelimit_burst * ratelimit_jiffies; | 1012 | toks = ratelimit_burst * ratelimit_jiffies; |
1008 | if (toks >= ratelimit_jiffies) { | 1013 | if (toks >= ratelimit_jiffies) { |
1009 | int lost = missed; | 1014 | int lost = missed; |
1015 | |||
1010 | missed = 0; | 1016 | missed = 0; |
1011 | toks -= ratelimit_jiffies; | 1017 | toks -= ratelimit_jiffies; |
1012 | spin_unlock_irqrestore(&ratelimit_lock, flags); | 1018 | spin_unlock_irqrestore(&ratelimit_lock, flags); |
@@ -1021,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | |||
1021 | EXPORT_SYMBOL(__printk_ratelimit); | 1027 | EXPORT_SYMBOL(__printk_ratelimit); |
1022 | 1028 | ||
1023 | /* minimum time in jiffies between messages */ | 1029 | /* minimum time in jiffies between messages */ |
1024 | int printk_ratelimit_jiffies = 5*HZ; | 1030 | int printk_ratelimit_jiffies = 5 * HZ; |
1025 | 1031 | ||
1026 | /* number of messages we send before ratelimiting */ | 1032 | /* number of messages we send before ratelimiting */ |
1027 | int printk_ratelimit_burst = 10; | 1033 | int printk_ratelimit_burst = 10; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 019e04ec065a..863eee8bff47 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) | |||
56 | signal_wake_up(child, 1); | 56 | signal_wake_up(child, 1); |
57 | } | 57 | } |
58 | } | 58 | } |
59 | if (child->signal->flags & SIGNAL_GROUP_EXIT) { | ||
60 | sigaddset(&child->pending.signal, SIGKILL); | ||
61 | signal_wake_up(child, 1); | ||
62 | } | ||
59 | spin_unlock(&child->sighand->siglock); | 63 | spin_unlock(&child->sighand->siglock); |
60 | } | 64 | } |
61 | 65 | ||
@@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child) | |||
77 | SET_LINKS(child); | 81 | SET_LINKS(child); |
78 | } | 82 | } |
79 | 83 | ||
80 | if (child->state == TASK_TRACED) | 84 | ptrace_untrace(child); |
81 | ptrace_untrace(child); | ||
82 | } | 85 | } |
83 | 86 | ||
84 | /* | 87 | /* |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2559d4b8f23f..c4d159a21e04 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -154,6 +154,15 @@ void fastcall call_rcu_bh(struct rcu_head *head, | |||
154 | } | 154 | } |
155 | 155 | ||
156 | /* | 156 | /* |
157 | * Return the number of RCU batches processed thus far. Useful | ||
158 | * for debug and statistics. | ||
159 | */ | ||
160 | long rcu_batches_completed(void) | ||
161 | { | ||
162 | return rcu_ctrlblk.completed; | ||
163 | } | ||
164 | |||
165 | /* | ||
157 | * Invoke the completed RCU callbacks. They are expected to be in | 166 | * Invoke the completed RCU callbacks. They are expected to be in |
158 | * a per-cpu list. | 167 | * a per-cpu list. |
159 | */ | 168 | */ |
@@ -501,6 +510,7 @@ void synchronize_kernel(void) | |||
501 | } | 510 | } |
502 | 511 | ||
503 | module_param(maxbatch, int, 0); | 512 | module_param(maxbatch, int, 0); |
513 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
504 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ | 514 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ |
505 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ | 515 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ |
506 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 516 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 000000000000..9b58f1eff3ca --- /dev/null +++ b/kernel/rcutorture.c | |||
@@ -0,0 +1,492 @@ | |||
1 | /* | ||
2 | * Read-Copy Update /proc-based torture test facility | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2005 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * | ||
22 | * See also: Documentation/RCU/torture.txt | ||
23 | */ | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/kthread.h> | ||
29 | #include <linux/err.h> | ||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/smp.h> | ||
32 | #include <linux/rcupdate.h> | ||
33 | #include <linux/interrupt.h> | ||
34 | #include <linux/sched.h> | ||
35 | #include <asm/atomic.h> | ||
36 | #include <linux/bitops.h> | ||
37 | #include <linux/module.h> | ||
38 | #include <linux/completion.h> | ||
39 | #include <linux/moduleparam.h> | ||
40 | #include <linux/percpu.h> | ||
41 | #include <linux/notifier.h> | ||
42 | #include <linux/rcuref.h> | ||
43 | #include <linux/cpu.h> | ||
44 | #include <linux/random.h> | ||
45 | #include <linux/delay.h> | ||
46 | #include <linux/byteorder/swabb.h> | ||
47 | #include <linux/stat.h> | ||
48 | |||
49 | MODULE_LICENSE("GPL"); | ||
50 | |||
51 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | |
52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | ||
53 | /* Defaults to "only at end of test". */ | ||
54 | static int verbose = 0; /* Print more debug info. */ | ||
55 | |||
56 | MODULE_PARM(nreaders, "i"); | ||
57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
58 | MODULE_PARM(stat_interval, "i"); | ||
59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
60 | MODULE_PARM(verbose, "i"); | ||
61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
62 | #define TORTURE_FLAG "rcutorture: " | ||
63 | #define PRINTK_STRING(s) \ | ||
64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | ||
65 | #define VERBOSE_PRINTK_STRING(s) \ | ||
66 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | ||
67 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | ||
68 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | ||
69 | |||
70 | static char printk_buf[4096]; | ||
71 | |||
72 | static int nrealreaders; | ||
73 | static struct task_struct *writer_task; | ||
74 | static struct task_struct **reader_tasks; | ||
75 | static struct task_struct *stats_task; | ||
76 | |||
77 | #define RCU_TORTURE_PIPE_LEN 10 | ||
78 | |||
79 | struct rcu_torture { | ||
80 | struct rcu_head rtort_rcu; | ||
81 | int rtort_pipe_count; | ||
82 | struct list_head rtort_free; | ||
83 | }; | ||
84 | |||
85 | static int fullstop = 0; /* stop generating callbacks at test end. */ | ||
86 | static LIST_HEAD(rcu_torture_freelist); | ||
87 | static struct rcu_torture *rcu_torture_current = NULL; | ||
88 | static long rcu_torture_current_version = 0; | ||
89 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | ||
90 | static DEFINE_SPINLOCK(rcu_torture_lock); | ||
91 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | ||
92 | { 0 }; | ||
93 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = | ||
94 | { 0 }; | ||
95 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; | ||
96 | atomic_t n_rcu_torture_alloc; | ||
97 | atomic_t n_rcu_torture_alloc_fail; | ||
98 | atomic_t n_rcu_torture_free; | ||
99 | |||
100 | /* | ||
101 | * Allocate an element from the rcu_tortures pool. | ||
102 | */ | ||
103 | struct rcu_torture * | ||
104 | rcu_torture_alloc(void) | ||
105 | { | ||
106 | struct list_head *p; | ||
107 | |||
108 | spin_lock(&rcu_torture_lock); | ||
109 | if (list_empty(&rcu_torture_freelist)) { | ||
110 | atomic_inc(&n_rcu_torture_alloc_fail); | ||
111 | spin_unlock(&rcu_torture_lock); | ||
112 | return NULL; | ||
113 | } | ||
114 | atomic_inc(&n_rcu_torture_alloc); | ||
115 | p = rcu_torture_freelist.next; | ||
116 | list_del_init(p); | ||
117 | spin_unlock(&rcu_torture_lock); | ||
118 | return container_of(p, struct rcu_torture, rtort_free); | ||
119 | } | ||
120 | |||
121 | /* | ||
122 | * Free an element to the rcu_tortures pool. | ||
123 | */ | ||
124 | static void | ||
125 | rcu_torture_free(struct rcu_torture *p) | ||
126 | { | ||
127 | atomic_inc(&n_rcu_torture_free); | ||
128 | spin_lock(&rcu_torture_lock); | ||
129 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); | ||
130 | spin_unlock(&rcu_torture_lock); | ||
131 | } | ||
132 | |||
133 | static void | ||
134 | rcu_torture_cb(struct rcu_head *p) | ||
135 | { | ||
136 | int i; | ||
137 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
138 | |||
139 | if (fullstop) { | ||
140 | /* Test is ending, just drop callbacks on the floor. */ | ||
141 | /* The next initialization will pick up the pieces. */ | ||
142 | return; | ||
143 | } | ||
144 | i = rp->rtort_pipe_count; | ||
145 | if (i > RCU_TORTURE_PIPE_LEN) | ||
146 | i = RCU_TORTURE_PIPE_LEN; | ||
147 | atomic_inc(&rcu_torture_wcount[i]); | ||
148 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) | ||
149 | rcu_torture_free(rp); | ||
150 | else | ||
151 | call_rcu(p, rcu_torture_cb); | ||
152 | } | ||
153 | |||
154 | struct rcu_random_state { | ||
155 | unsigned long rrs_state; | ||
156 | unsigned long rrs_count; | ||
157 | }; | ||
158 | |||
159 | #define RCU_RANDOM_MULT 39916801 /* prime */ | ||
160 | #define RCU_RANDOM_ADD 479001701 /* prime */ | ||
161 | #define RCU_RANDOM_REFRESH 10000 | ||
162 | |||
163 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } | ||
164 | |||
165 | /* | ||
166 | * Crude but fast random-number generator. Uses a linear congruential | ||
167 | * generator, with occasional help from get_random_bytes(). | ||
168 | */ | ||
169 | static long | ||
170 | rcu_random(struct rcu_random_state *rrsp) | ||
171 | { | ||
172 | long refresh; | ||
173 | |||
174 | if (--rrsp->rrs_count < 0) { | ||
175 | get_random_bytes(&refresh, sizeof(refresh)); | ||
176 | rrsp->rrs_state += refresh; | ||
177 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | ||
178 | } | ||
179 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | ||
180 | return swahw32(rrsp->rrs_state); | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * RCU torture writer kthread. Repeatedly substitutes a new structure | ||
185 | * for that pointed to by rcu_torture_current, freeing the old structure | ||
186 | * after a series of grace periods (the "pipeline"). | ||
187 | */ | ||
188 | static int | ||
189 | rcu_torture_writer(void *arg) | ||
190 | { | ||
191 | int i; | ||
192 | long oldbatch = rcu_batches_completed(); | ||
193 | struct rcu_torture *rp; | ||
194 | struct rcu_torture *old_rp; | ||
195 | static DEFINE_RCU_RANDOM(rand); | ||
196 | |||
197 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | ||
198 | do { | ||
199 | schedule_timeout_uninterruptible(1); | ||
200 | if (rcu_batches_completed() == oldbatch) | ||
201 | continue; | ||
202 | if ((rp = rcu_torture_alloc()) == NULL) | ||
203 | continue; | ||
204 | rp->rtort_pipe_count = 0; | ||
205 | udelay(rcu_random(&rand) & 0x3ff); | ||
206 | old_rp = rcu_torture_current; | ||
207 | rcu_assign_pointer(rcu_torture_current, rp); | ||
208 | smp_wmb(); | ||
209 | if (old_rp != NULL) { | ||
210 | i = old_rp->rtort_pipe_count; | ||
211 | if (i > RCU_TORTURE_PIPE_LEN) | ||
212 | i = RCU_TORTURE_PIPE_LEN; | ||
213 | atomic_inc(&rcu_torture_wcount[i]); | ||
214 | old_rp->rtort_pipe_count++; | ||
215 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | ||
216 | } | ||
217 | rcu_torture_current_version++; | ||
218 | oldbatch = rcu_batches_completed(); | ||
219 | } while (!kthread_should_stop() && !fullstop); | ||
220 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | ||
221 | while (!kthread_should_stop()) | ||
222 | schedule_timeout_uninterruptible(1); | ||
223 | return 0; | ||
224 | } | ||
225 | |||
226 | /* | ||
227 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, | ||
228 | * incrementing the corresponding element of the pipeline array. The | ||
229 | * counter in the element should never be greater than 1, otherwise, the | ||
230 | * RCU implementation is broken. | ||
231 | */ | ||
232 | static int | ||
233 | rcu_torture_reader(void *arg) | ||
234 | { | ||
235 | int completed; | ||
236 | DEFINE_RCU_RANDOM(rand); | ||
237 | struct rcu_torture *p; | ||
238 | int pipe_count; | ||
239 | |||
240 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | ||
241 | do { | ||
242 | rcu_read_lock(); | ||
243 | completed = rcu_batches_completed(); | ||
244 | p = rcu_dereference(rcu_torture_current); | ||
245 | if (p == NULL) { | ||
246 | /* Wait for rcu_torture_writer to get underway */ | ||
247 | rcu_read_unlock(); | ||
248 | schedule_timeout_interruptible(HZ); | ||
249 | continue; | ||
250 | } | ||
251 | udelay(rcu_random(&rand) & 0x7f); | ||
252 | preempt_disable(); | ||
253 | pipe_count = p->rtort_pipe_count; | ||
254 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | ||
255 | /* Should not happen, but... */ | ||
256 | pipe_count = RCU_TORTURE_PIPE_LEN; | ||
257 | } | ||
258 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | ||
259 | completed = rcu_batches_completed() - completed; | ||
260 | if (completed > RCU_TORTURE_PIPE_LEN) { | ||
261 | /* Should not happen, but... */ | ||
262 | completed = RCU_TORTURE_PIPE_LEN; | ||
263 | } | ||
264 | ++__get_cpu_var(rcu_torture_batch)[completed]; | ||
265 | preempt_enable(); | ||
266 | rcu_read_unlock(); | ||
267 | schedule(); | ||
268 | } while (!kthread_should_stop() && !fullstop); | ||
269 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | ||
270 | while (!kthread_should_stop()) | ||
271 | schedule_timeout_uninterruptible(1); | ||
272 | return 0; | ||
273 | } | ||
274 | |||
275 | /* | ||
276 | * Create an RCU-torture statistics message in the specified buffer. | ||
277 | */ | ||
278 | static int | ||
279 | rcu_torture_printk(char *page) | ||
280 | { | ||
281 | int cnt = 0; | ||
282 | int cpu; | ||
283 | int i; | ||
284 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
285 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
286 | |||
287 | for_each_cpu(cpu) { | ||
288 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
289 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | ||
290 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | ||
291 | } | ||
292 | } | ||
293 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { | ||
294 | if (pipesummary[i] != 0) | ||
295 | break; | ||
296 | } | ||
297 | cnt += sprintf(&page[cnt], "rcutorture: "); | ||
298 | cnt += sprintf(&page[cnt], | ||
299 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d", | ||
300 | rcu_torture_current, | ||
301 | rcu_torture_current_version, | ||
302 | list_empty(&rcu_torture_freelist), | ||
303 | atomic_read(&n_rcu_torture_alloc), | ||
304 | atomic_read(&n_rcu_torture_alloc_fail), | ||
305 | atomic_read(&n_rcu_torture_free)); | ||
306 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
307 | if (i > 1) | ||
308 | cnt += sprintf(&page[cnt], "!!! "); | ||
309 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | ||
310 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
311 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | ||
312 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
313 | cnt += sprintf(&page[cnt], "Reader Batch: "); | ||
314 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | ||
315 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | ||
316 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
317 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | ||
318 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
319 | cnt += sprintf(&page[cnt], " %d", | ||
320 | atomic_read(&rcu_torture_wcount[i])); | ||
321 | } | ||
322 | cnt += sprintf(&page[cnt], "\n"); | ||
323 | return cnt; | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * Print torture statistics. Caller must ensure that there is only | ||
328 | * one call to this function at a given time!!! This is normally | ||
329 | * accomplished by relying on the module system to only have one copy | ||
330 | * of the module loaded, and then by giving the rcu_torture_stats | ||
331 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
332 | * thread is not running). | ||
333 | */ | ||
334 | static void | ||
335 | rcu_torture_stats_print(void) | ||
336 | { | ||
337 | int cnt; | ||
338 | |||
339 | cnt = rcu_torture_printk(printk_buf); | ||
340 | printk(KERN_ALERT "%s", printk_buf); | ||
341 | } | ||
342 | |||
343 | /* | ||
344 | * Periodically prints torture statistics, if periodic statistics printing | ||
345 | * was specified via the stat_interval module parameter. | ||
346 | * | ||
347 | * No need to worry about fullstop here, since this one doesn't reference | ||
348 | * volatile state or register callbacks. | ||
349 | */ | ||
350 | static int | ||
351 | rcu_torture_stats(void *arg) | ||
352 | { | ||
353 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); | ||
354 | do { | ||
355 | schedule_timeout_interruptible(stat_interval * HZ); | ||
356 | rcu_torture_stats_print(); | ||
357 | } while (!kthread_should_stop()); | ||
358 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); | ||
359 | return 0; | ||
360 | } | ||
361 | |||
362 | static void | ||
363 | rcu_torture_cleanup(void) | ||
364 | { | ||
365 | int i; | ||
366 | |||
367 | fullstop = 1; | ||
368 | if (writer_task != NULL) { | ||
369 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | ||
370 | kthread_stop(writer_task); | ||
371 | } | ||
372 | writer_task = NULL; | ||
373 | |||
374 | if (reader_tasks != NULL) { | ||
375 | for (i = 0; i < nrealreaders; i++) { | ||
376 | if (reader_tasks[i] != NULL) { | ||
377 | VERBOSE_PRINTK_STRING( | ||
378 | "Stopping rcu_torture_reader task"); | ||
379 | kthread_stop(reader_tasks[i]); | ||
380 | } | ||
381 | reader_tasks[i] = NULL; | ||
382 | } | ||
383 | kfree(reader_tasks); | ||
384 | reader_tasks = NULL; | ||
385 | } | ||
386 | rcu_torture_current = NULL; | ||
387 | |||
388 | if (stats_task != NULL) { | ||
389 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | ||
390 | kthread_stop(stats_task); | ||
391 | } | ||
392 | stats_task = NULL; | ||
393 | |||
394 | /* Wait for all RCU callbacks to fire. */ | ||
395 | |||
396 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | ||
397 | synchronize_rcu(); | ||
398 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | ||
399 | PRINTK_STRING("--- End of test"); | ||
400 | } | ||
401 | |||
402 | static int | ||
403 | rcu_torture_init(void) | ||
404 | { | ||
405 | int i; | ||
406 | int cpu; | ||
407 | int firsterr = 0; | ||
408 | |||
409 | /* Process args and tell the world that the torturer is on the job. */ | ||
410 | |||
411 | if (nreaders >= 0) | ||
412 | nrealreaders = nreaders; | ||
413 | else | ||
414 | nrealreaders = 2 * num_online_cpus(); | ||
415 | printk(KERN_ALERT TORTURE_FLAG | ||
416 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | ||
417 | nrealreaders, stat_interval, verbose); | ||
418 | fullstop = 0; | ||
419 | |||
420 | /* Set up the freelist. */ | ||
421 | |||
422 | INIT_LIST_HEAD(&rcu_torture_freelist); | ||
423 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { | ||
424 | list_add_tail(&rcu_tortures[i].rtort_free, | ||
425 | &rcu_torture_freelist); | ||
426 | } | ||
427 | |||
428 | /* Initialize the statistics so that each run gets its own numbers. */ | ||
429 | |||
430 | rcu_torture_current = NULL; | ||
431 | rcu_torture_current_version = 0; | ||
432 | atomic_set(&n_rcu_torture_alloc, 0); | ||
433 | atomic_set(&n_rcu_torture_alloc_fail, 0); | ||
434 | atomic_set(&n_rcu_torture_free, 0); | ||
435 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
436 | atomic_set(&rcu_torture_wcount[i], 0); | ||
437 | for_each_cpu(cpu) { | ||
438 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
439 | per_cpu(rcu_torture_count, cpu)[i] = 0; | ||
440 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | ||
441 | } | ||
442 | } | ||
443 | |||
444 | /* Start up the kthreads. */ | ||
445 | |||
446 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | ||
447 | writer_task = kthread_run(rcu_torture_writer, NULL, | ||
448 | "rcu_torture_writer"); | ||
449 | if (IS_ERR(writer_task)) { | ||
450 | firsterr = PTR_ERR(writer_task); | ||
451 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | ||
452 | writer_task = NULL; | ||
453 | goto unwind; | ||
454 | } | ||
455 | reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), | ||
456 | GFP_KERNEL); | ||
457 | if (reader_tasks == NULL) { | ||
458 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | ||
459 | firsterr = -ENOMEM; | ||
460 | goto unwind; | ||
461 | } | ||
462 | for (i = 0; i < nrealreaders; i++) { | ||
463 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); | ||
464 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, | ||
465 | "rcu_torture_reader"); | ||
466 | if (IS_ERR(reader_tasks[i])) { | ||
467 | firsterr = PTR_ERR(reader_tasks[i]); | ||
468 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); | ||
469 | reader_tasks[i] = NULL; | ||
470 | goto unwind; | ||
471 | } | ||
472 | } | ||
473 | if (stat_interval > 0) { | ||
474 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); | ||
475 | stats_task = kthread_run(rcu_torture_stats, NULL, | ||
476 | "rcu_torture_stats"); | ||
477 | if (IS_ERR(stats_task)) { | ||
478 | firsterr = PTR_ERR(stats_task); | ||
479 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); | ||
480 | stats_task = NULL; | ||
481 | goto unwind; | ||
482 | } | ||
483 | } | ||
484 | return 0; | ||
485 | |||
486 | unwind: | ||
487 | rcu_torture_cleanup(); | ||
488 | return firsterr; | ||
489 | } | ||
490 | |||
491 | module_init(rcu_torture_init); | ||
492 | module_exit(rcu_torture_cleanup); | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 1e5cafdf4e27..340dd238c16d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
2511 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 2511 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
2512 | /* Account for system time used */ | 2512 | /* Account for system time used */ |
2513 | acct_update_integrals(p); | 2513 | acct_update_integrals(p); |
2514 | /* Update rss highwater mark */ | ||
2515 | update_mem_hiwater(p); | ||
2516 | } | 2514 | } |
2517 | 2515 | ||
2518 | /* | 2516 | /* |
@@ -3879,7 +3877,6 @@ EXPORT_SYMBOL(cpu_present_map); | |||
3879 | 3877 | ||
3880 | #ifndef CONFIG_SMP | 3878 | #ifndef CONFIG_SMP |
3881 | cpumask_t cpu_online_map = CPU_MASK_ALL; | 3879 | cpumask_t cpu_online_map = CPU_MASK_ALL; |
3882 | EXPORT_SYMBOL_GPL(cpu_online_map); | ||
3883 | cpumask_t cpu_possible_map = CPU_MASK_ALL; | 3880 | cpumask_t cpu_possible_map = CPU_MASK_ALL; |
3884 | #endif | 3881 | #endif |
3885 | 3882 | ||
diff --git a/kernel/signal.c b/kernel/signal.c index f2b96b08fb44..1bf3c39d6109 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, | |||
277 | } else { | 277 | } else { |
278 | INIT_LIST_HEAD(&q->list); | 278 | INIT_LIST_HEAD(&q->list); |
279 | q->flags = 0; | 279 | q->flags = 0; |
280 | q->lock = NULL; | ||
281 | q->user = get_uid(t->user); | 280 | q->user = get_uid(t->user); |
282 | } | 281 | } |
283 | return(q); | 282 | return(q); |
@@ -406,6 +405,8 @@ void __exit_signal(struct task_struct *tsk) | |||
406 | 405 | ||
407 | void exit_signal(struct task_struct *tsk) | 406 | void exit_signal(struct task_struct *tsk) |
408 | { | 407 | { |
408 | atomic_dec(&tsk->signal->live); | ||
409 | |||
409 | write_lock_irq(&tasklist_lock); | 410 | write_lock_irq(&tasklist_lock); |
410 | __exit_signal(tsk); | 411 | __exit_signal(tsk); |
411 | write_unlock_irq(&tasklist_lock); | 412 | write_unlock_irq(&tasklist_lock); |
@@ -650,8 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
650 | if (!valid_signal(sig)) | 651 | if (!valid_signal(sig)) |
651 | return error; | 652 | return error; |
652 | error = -EPERM; | 653 | error = -EPERM; |
653 | if ((!info || ((unsigned long)info != 1 && | 654 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
654 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
655 | && ((sig != SIGCONT) || | 655 | && ((sig != SIGCONT) || |
656 | (current->signal->session != t->signal->session)) | 656 | (current->signal->session != t->signal->session)) |
657 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 657 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
@@ -788,7 +788,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
788 | * fast-pathed signals for kernel-internal things like SIGSTOP | 788 | * fast-pathed signals for kernel-internal things like SIGSTOP |
789 | * or SIGKILL. | 789 | * or SIGKILL. |
790 | */ | 790 | */ |
791 | if ((unsigned long)info == 2) | 791 | if (info == SEND_SIG_FORCED) |
792 | goto out_set; | 792 | goto out_set; |
793 | 793 | ||
794 | /* Real-time signals must be queued if sent by sigqueue, or | 794 | /* Real-time signals must be queued if sent by sigqueue, or |
@@ -800,19 +800,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
800 | pass on the info struct. */ | 800 | pass on the info struct. */ |
801 | 801 | ||
802 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && | 802 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && |
803 | ((unsigned long) info < 2 || | 803 | (is_si_special(info) || |
804 | info->si_code >= 0))); | 804 | info->si_code >= 0))); |
805 | if (q) { | 805 | if (q) { |
806 | list_add_tail(&q->list, &signals->list); | 806 | list_add_tail(&q->list, &signals->list); |
807 | switch ((unsigned long) info) { | 807 | switch ((unsigned long) info) { |
808 | case 0: | 808 | case (unsigned long) SEND_SIG_NOINFO: |
809 | q->info.si_signo = sig; | 809 | q->info.si_signo = sig; |
810 | q->info.si_errno = 0; | 810 | q->info.si_errno = 0; |
811 | q->info.si_code = SI_USER; | 811 | q->info.si_code = SI_USER; |
812 | q->info.si_pid = current->pid; | 812 | q->info.si_pid = current->pid; |
813 | q->info.si_uid = current->uid; | 813 | q->info.si_uid = current->uid; |
814 | break; | 814 | break; |
815 | case 1: | 815 | case (unsigned long) SEND_SIG_PRIV: |
816 | q->info.si_signo = sig; | 816 | q->info.si_signo = sig; |
817 | q->info.si_errno = 0; | 817 | q->info.si_errno = 0; |
818 | q->info.si_code = SI_KERNEL; | 818 | q->info.si_code = SI_KERNEL; |
@@ -823,20 +823,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
823 | copy_siginfo(&q->info, info); | 823 | copy_siginfo(&q->info, info); |
824 | break; | 824 | break; |
825 | } | 825 | } |
826 | } else { | 826 | } else if (!is_si_special(info)) { |
827 | if (sig >= SIGRTMIN && info && (unsigned long)info != 1 | 827 | if (sig >= SIGRTMIN && info->si_code != SI_USER) |
828 | && info->si_code != SI_USER) | ||
829 | /* | 828 | /* |
830 | * Queue overflow, abort. We may abort if the signal was rt | 829 | * Queue overflow, abort. We may abort if the signal was rt |
831 | * and sent by user using something other than kill(). | 830 | * and sent by user using something other than kill(). |
832 | */ | 831 | */ |
833 | return -EAGAIN; | 832 | return -EAGAIN; |
834 | if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) | ||
835 | /* | ||
836 | * Set up a return to indicate that we dropped | ||
837 | * the signal. | ||
838 | */ | ||
839 | ret = info->si_sys_private; | ||
840 | } | 833 | } |
841 | 834 | ||
842 | out_set: | 835 | out_set: |
@@ -857,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
857 | BUG(); | 850 | BUG(); |
858 | assert_spin_locked(&t->sighand->siglock); | 851 | assert_spin_locked(&t->sighand->siglock); |
859 | 852 | ||
860 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
861 | /* | ||
862 | * Set up a return to indicate that we dropped the signal. | ||
863 | */ | ||
864 | ret = info->si_sys_private; | ||
865 | |||
866 | /* Short-circuit ignored signals. */ | 853 | /* Short-circuit ignored signals. */ |
867 | if (sig_ignored(t, sig)) | 854 | if (sig_ignored(t, sig)) |
868 | goto out; | 855 | goto out; |
@@ -892,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
892 | int ret; | 879 | int ret; |
893 | 880 | ||
894 | spin_lock_irqsave(&t->sighand->siglock, flags); | 881 | spin_lock_irqsave(&t->sighand->siglock, flags); |
895 | if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { | 882 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { |
896 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | 883 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; |
884 | } | ||
885 | if (sigismember(&t->blocked, sig)) { | ||
897 | sigdelset(&t->blocked, sig); | 886 | sigdelset(&t->blocked, sig); |
898 | recalc_sigpending_tsk(t); | ||
899 | } | 887 | } |
888 | recalc_sigpending_tsk(t); | ||
900 | ret = specific_send_sig_info(sig, info, t); | 889 | ret = specific_send_sig_info(sig, info, t); |
901 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 890 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
902 | 891 | ||
@@ -906,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
906 | void | 895 | void |
907 | force_sig_specific(int sig, struct task_struct *t) | 896 | force_sig_specific(int sig, struct task_struct *t) |
908 | { | 897 | { |
909 | unsigned long int flags; | 898 | force_sig_info(sig, SEND_SIG_FORCED, t); |
910 | |||
911 | spin_lock_irqsave(&t->sighand->siglock, flags); | ||
912 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) | ||
913 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | ||
914 | sigdelset(&t->blocked, sig); | ||
915 | recalc_sigpending_tsk(t); | ||
916 | specific_send_sig_info(sig, (void *)2, t); | ||
917 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | ||
918 | } | 899 | } |
919 | 900 | ||
920 | /* | 901 | /* |
@@ -1049,12 +1030,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1049 | assert_spin_locked(&p->sighand->siglock); | 1030 | assert_spin_locked(&p->sighand->siglock); |
1050 | handle_stop_signal(sig, p); | 1031 | handle_stop_signal(sig, p); |
1051 | 1032 | ||
1052 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
1053 | /* | ||
1054 | * Set up a return to indicate that we dropped the signal. | ||
1055 | */ | ||
1056 | ret = info->si_sys_private; | ||
1057 | |||
1058 | /* Short-circuit ignored signals. */ | 1033 | /* Short-circuit ignored signals. */ |
1059 | if (sig_ignored(p, sig)) | 1034 | if (sig_ignored(p, sig)) |
1060 | return ret; | 1035 | return ret; |
@@ -1107,8 +1082,8 @@ void zap_other_threads(struct task_struct *p) | |||
1107 | if (t != p->group_leader) | 1082 | if (t != p->group_leader) |
1108 | t->exit_signal = -1; | 1083 | t->exit_signal = -1; |
1109 | 1084 | ||
1085 | /* SIGKILL will be handled before any pending SIGSTOP */ | ||
1110 | sigaddset(&t->pending.signal, SIGKILL); | 1086 | sigaddset(&t->pending.signal, SIGKILL); |
1111 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
1112 | signal_wake_up(t, 1); | 1087 | signal_wake_up(t, 1); |
1113 | } | 1088 | } |
1114 | } | 1089 | } |
@@ -1284,10 +1259,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1284 | return ret; | 1259 | return ret; |
1285 | } | 1260 | } |
1286 | 1261 | ||
1262 | #define __si_special(priv) \ | ||
1263 | ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) | ||
1264 | |||
1287 | int | 1265 | int |
1288 | send_sig(int sig, struct task_struct *p, int priv) | 1266 | send_sig(int sig, struct task_struct *p, int priv) |
1289 | { | 1267 | { |
1290 | return send_sig_info(sig, (void*)(long)(priv != 0), p); | 1268 | return send_sig_info(sig, __si_special(priv), p); |
1291 | } | 1269 | } |
1292 | 1270 | ||
1293 | /* | 1271 | /* |
@@ -1307,7 +1285,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1307 | void | 1285 | void |
1308 | force_sig(int sig, struct task_struct *p) | 1286 | force_sig(int sig, struct task_struct *p) |
1309 | { | 1287 | { |
1310 | force_sig_info(sig, (void*)1L, p); | 1288 | force_sig_info(sig, SEND_SIG_PRIV, p); |
1311 | } | 1289 | } |
1312 | 1290 | ||
1313 | /* | 1291 | /* |
@@ -1332,13 +1310,13 @@ force_sigsegv(int sig, struct task_struct *p) | |||
1332 | int | 1310 | int |
1333 | kill_pg(pid_t pgrp, int sig, int priv) | 1311 | kill_pg(pid_t pgrp, int sig, int priv) |
1334 | { | 1312 | { |
1335 | return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); | 1313 | return kill_pg_info(sig, __si_special(priv), pgrp); |
1336 | } | 1314 | } |
1337 | 1315 | ||
1338 | int | 1316 | int |
1339 | kill_proc(pid_t pid, int sig, int priv) | 1317 | kill_proc(pid_t pid, int sig, int priv) |
1340 | { | 1318 | { |
1341 | return kill_proc_info(sig, (void *)(long)(priv != 0), pid); | 1319 | return kill_proc_info(sig, __si_special(priv), pid); |
1342 | } | 1320 | } |
1343 | 1321 | ||
1344 | /* | 1322 | /* |
@@ -1369,11 +1347,12 @@ void sigqueue_free(struct sigqueue *q) | |||
1369 | * pending queue. | 1347 | * pending queue. |
1370 | */ | 1348 | */ |
1371 | if (unlikely(!list_empty(&q->list))) { | 1349 | if (unlikely(!list_empty(&q->list))) { |
1372 | read_lock(&tasklist_lock); | 1350 | spinlock_t *lock = &current->sighand->siglock; |
1373 | spin_lock_irqsave(q->lock, flags); | 1351 | read_lock(&tasklist_lock); |
1352 | spin_lock_irqsave(lock, flags); | ||
1374 | if (!list_empty(&q->list)) | 1353 | if (!list_empty(&q->list)) |
1375 | list_del_init(&q->list); | 1354 | list_del_init(&q->list); |
1376 | spin_unlock_irqrestore(q->lock, flags); | 1355 | spin_unlock_irqrestore(lock, flags); |
1377 | read_unlock(&tasklist_lock); | 1356 | read_unlock(&tasklist_lock); |
1378 | } | 1357 | } |
1379 | q->flags &= ~SIGQUEUE_PREALLOC; | 1358 | q->flags &= ~SIGQUEUE_PREALLOC; |
@@ -1412,7 +1391,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1412 | goto out; | 1391 | goto out; |
1413 | } | 1392 | } |
1414 | 1393 | ||
1415 | q->lock = &p->sighand->siglock; | ||
1416 | list_add_tail(&q->list, &p->pending.list); | 1394 | list_add_tail(&q->list, &p->pending.list); |
1417 | sigaddset(&p->pending.signal, sig); | 1395 | sigaddset(&p->pending.signal, sig); |
1418 | if (!sigismember(&p->blocked, sig)) | 1396 | if (!sigismember(&p->blocked, sig)) |
@@ -1460,7 +1438,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1460 | * We always use the shared queue for process-wide signals, | 1438 | * We always use the shared queue for process-wide signals, |
1461 | * to avoid several races. | 1439 | * to avoid several races. |
1462 | */ | 1440 | */ |
1463 | q->lock = &p->sighand->siglock; | ||
1464 | list_add_tail(&q->list, &p->signal->shared_pending.list); | 1441 | list_add_tail(&q->list, &p->signal->shared_pending.list); |
1465 | sigaddset(&p->signal->shared_pending.signal, sig); | 1442 | sigaddset(&p->signal->shared_pending.signal, sig); |
1466 | 1443 | ||
@@ -1879,9 +1856,9 @@ relock: | |||
1879 | /* Let the debugger run. */ | 1856 | /* Let the debugger run. */ |
1880 | ptrace_stop(signr, signr, info); | 1857 | ptrace_stop(signr, signr, info); |
1881 | 1858 | ||
1882 | /* We're back. Did the debugger cancel the sig? */ | 1859 | /* We're back. Did the debugger cancel the sig or group_exit? */ |
1883 | signr = current->exit_code; | 1860 | signr = current->exit_code; |
1884 | if (signr == 0) | 1861 | if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) |
1885 | continue; | 1862 | continue; |
1886 | 1863 | ||
1887 | current->exit_code = 0; | 1864 | current->exit_code = 0; |
@@ -2283,26 +2260,13 @@ sys_kill(int pid, int sig) | |||
2283 | return kill_something_info(sig, &info, pid); | 2260 | return kill_something_info(sig, &info, pid); |
2284 | } | 2261 | } |
2285 | 2262 | ||
2286 | /** | 2263 | static int do_tkill(int tgid, int pid, int sig) |
2287 | * sys_tgkill - send signal to one specific thread | ||
2288 | * @tgid: the thread group ID of the thread | ||
2289 | * @pid: the PID of the thread | ||
2290 | * @sig: signal to be sent | ||
2291 | * | ||
2292 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
2293 | * exists but it's not belonging to the target process anymore. This | ||
2294 | * method solves the problem of threads exiting and PIDs getting reused. | ||
2295 | */ | ||
2296 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
2297 | { | 2264 | { |
2298 | struct siginfo info; | ||
2299 | int error; | 2265 | int error; |
2266 | struct siginfo info; | ||
2300 | struct task_struct *p; | 2267 | struct task_struct *p; |
2301 | 2268 | ||
2302 | /* This is only valid for single tasks */ | 2269 | error = -ESRCH; |
2303 | if (pid <= 0 || tgid <= 0) | ||
2304 | return -EINVAL; | ||
2305 | |||
2306 | info.si_signo = sig; | 2270 | info.si_signo = sig; |
2307 | info.si_errno = 0; | 2271 | info.si_errno = 0; |
2308 | info.si_code = SI_TKILL; | 2272 | info.si_code = SI_TKILL; |
@@ -2311,8 +2275,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) | |||
2311 | 2275 | ||
2312 | read_lock(&tasklist_lock); | 2276 | read_lock(&tasklist_lock); |
2313 | p = find_task_by_pid(pid); | 2277 | p = find_task_by_pid(pid); |
2314 | error = -ESRCH; | 2278 | if (p && (tgid <= 0 || p->tgid == tgid)) { |
2315 | if (p && (p->tgid == tgid)) { | ||
2316 | error = check_kill_permission(sig, &info, p); | 2279 | error = check_kill_permission(sig, &info, p); |
2317 | /* | 2280 | /* |
2318 | * The null signal is a permissions and process existence | 2281 | * The null signal is a permissions and process existence |
@@ -2326,47 +2289,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) | |||
2326 | } | 2289 | } |
2327 | } | 2290 | } |
2328 | read_unlock(&tasklist_lock); | 2291 | read_unlock(&tasklist_lock); |
2292 | |||
2329 | return error; | 2293 | return error; |
2330 | } | 2294 | } |
2331 | 2295 | ||
2296 | /** | ||
2297 | * sys_tgkill - send signal to one specific thread | ||
2298 | * @tgid: the thread group ID of the thread | ||
2299 | * @pid: the PID of the thread | ||
2300 | * @sig: signal to be sent | ||
2301 | * | ||
2302 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
2303 | * exists but it's not belonging to the target process anymore. This | ||
2304 | * method solves the problem of threads exiting and PIDs getting reused. | ||
2305 | */ | ||
2306 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
2307 | { | ||
2308 | /* This is only valid for single tasks */ | ||
2309 | if (pid <= 0 || tgid <= 0) | ||
2310 | return -EINVAL; | ||
2311 | |||
2312 | return do_tkill(tgid, pid, sig); | ||
2313 | } | ||
2314 | |||
2332 | /* | 2315 | /* |
2333 | * Send a signal to only one task, even if it's a CLONE_THREAD task. | 2316 | * Send a signal to only one task, even if it's a CLONE_THREAD task. |
2334 | */ | 2317 | */ |
2335 | asmlinkage long | 2318 | asmlinkage long |
2336 | sys_tkill(int pid, int sig) | 2319 | sys_tkill(int pid, int sig) |
2337 | { | 2320 | { |
2338 | struct siginfo info; | ||
2339 | int error; | ||
2340 | struct task_struct *p; | ||
2341 | |||
2342 | /* This is only valid for single tasks */ | 2321 | /* This is only valid for single tasks */ |
2343 | if (pid <= 0) | 2322 | if (pid <= 0) |
2344 | return -EINVAL; | 2323 | return -EINVAL; |
2345 | 2324 | ||
2346 | info.si_signo = sig; | 2325 | return do_tkill(0, pid, sig); |
2347 | info.si_errno = 0; | ||
2348 | info.si_code = SI_TKILL; | ||
2349 | info.si_pid = current->tgid; | ||
2350 | info.si_uid = current->uid; | ||
2351 | |||
2352 | read_lock(&tasklist_lock); | ||
2353 | p = find_task_by_pid(pid); | ||
2354 | error = -ESRCH; | ||
2355 | if (p) { | ||
2356 | error = check_kill_permission(sig, &info, p); | ||
2357 | /* | ||
2358 | * The null signal is a permissions and process existence | ||
2359 | * probe. No signal is actually delivered. | ||
2360 | */ | ||
2361 | if (!error && sig && p->sighand) { | ||
2362 | spin_lock_irq(&p->sighand->siglock); | ||
2363 | handle_stop_signal(sig, p); | ||
2364 | error = specific_send_sig_info(sig, &info, p); | ||
2365 | spin_unlock_irq(&p->sighand->siglock); | ||
2366 | } | ||
2367 | } | ||
2368 | read_unlock(&tasklist_lock); | ||
2369 | return error; | ||
2370 | } | 2326 | } |
2371 | 2327 | ||
2372 | asmlinkage long | 2328 | asmlinkage long |
diff --git a/kernel/time.c b/kernel/time.c index 40c2410ac99a..245d595a13cb 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) | |||
338 | if (mtemp >= MINSEC) { | 338 | if (mtemp >= MINSEC) { |
339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - | 339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - |
340 | SHIFT_UPDATE); | 340 | SHIFT_UPDATE); |
341 | if (ltemp < 0) | 341 | time_freq += shift_right(ltemp, SHIFT_KH); |
342 | time_freq -= -ltemp >> SHIFT_KH; | ||
343 | else | ||
344 | time_freq += ltemp >> SHIFT_KH; | ||
345 | } else /* calibration interval too short (p. 12) */ | 342 | } else /* calibration interval too short (p. 12) */ |
346 | result = TIME_ERROR; | 343 | result = TIME_ERROR; |
347 | } else { /* PLL mode */ | 344 | } else { /* PLL mode */ |
348 | if (mtemp < MAXSEC) { | 345 | if (mtemp < MAXSEC) { |
349 | ltemp *= mtemp; | 346 | ltemp *= mtemp; |
350 | if (ltemp < 0) | 347 | time_freq += shift_right(ltemp,(time_constant + |
351 | time_freq -= -ltemp >> (time_constant + | ||
352 | time_constant + | ||
353 | SHIFT_KF - SHIFT_USEC); | ||
354 | else | ||
355 | time_freq += ltemp >> (time_constant + | ||
356 | time_constant + | 348 | time_constant + |
357 | SHIFT_KF - SHIFT_USEC); | 349 | SHIFT_KF - SHIFT_USEC)); |
358 | } else /* calibration interval too long (p. 12) */ | 350 | } else /* calibration interval too long (p. 12) */ |
359 | result = TIME_ERROR; | 351 | result = TIME_ERROR; |
360 | } | 352 | } |
361 | if (time_freq > time_tolerance) | 353 | time_freq = min(time_freq, time_tolerance); |
362 | time_freq = time_tolerance; | 354 | time_freq = max(time_freq, -time_tolerance); |
363 | else if (time_freq < -time_tolerance) | ||
364 | time_freq = -time_tolerance; | ||
365 | } /* STA_PLL || STA_PPSTIME */ | 355 | } /* STA_PLL || STA_PPSTIME */ |
366 | } /* txc->modes & ADJ_OFFSET */ | 356 | } /* txc->modes & ADJ_OFFSET */ |
367 | if (txc->modes & ADJ_TICK) { | 357 | if (txc->modes & ADJ_TICK) { |
@@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 | |||
384 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 374 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) |
385 | txc->offset = save_adjust; | 375 | txc->offset = save_adjust; |
386 | else { | 376 | else { |
387 | if (time_offset < 0) | 377 | txc->offset = shift_right(time_offset, SHIFT_UPDATE); |
388 | txc->offset = -(-time_offset >> SHIFT_UPDATE); | ||
389 | else | ||
390 | txc->offset = time_offset >> SHIFT_UPDATE; | ||
391 | } | 378 | } |
392 | txc->freq = time_freq + pps_freq; | 379 | txc->freq = time_freq + pps_freq; |
393 | txc->maxerror = time_maxerror; | 380 | txc->maxerror = time_maxerror; |
@@ -532,6 +519,7 @@ int do_settimeofday (struct timespec *tv) | |||
532 | clock_was_set(); | 519 | clock_was_set(); |
533 | return 0; | 520 | return 0; |
534 | } | 521 | } |
522 | EXPORT_SYMBOL(do_settimeofday); | ||
535 | 523 | ||
536 | void do_gettimeofday (struct timeval *tv) | 524 | void do_gettimeofday (struct timeval *tv) |
537 | { | 525 | { |
diff --git a/kernel/timer.c b/kernel/timer.c index 3ba10fa35b60..fd74268d8663 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); | |||
46 | #define time_interpolator_update(x) | 46 | #define time_interpolator_update(x) |
47 | #endif | 47 | #endif |
48 | 48 | ||
49 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | ||
50 | |||
51 | EXPORT_SYMBOL(jiffies_64); | ||
52 | |||
49 | /* | 53 | /* |
50 | * per-CPU timer vector definitions: | 54 | * per-CPU timer vector definitions: |
51 | */ | 55 | */ |
@@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base, | |||
91 | #endif | 95 | #endif |
92 | } | 96 | } |
93 | 97 | ||
94 | static void check_timer_failed(struct timer_list *timer) | ||
95 | { | ||
96 | static int whine_count; | ||
97 | if (whine_count < 16) { | ||
98 | whine_count++; | ||
99 | printk("Uninitialised timer!\n"); | ||
100 | printk("This is just a warning. Your computer is OK\n"); | ||
101 | printk("function=0x%p, data=0x%lx\n", | ||
102 | timer->function, timer->data); | ||
103 | dump_stack(); | ||
104 | } | ||
105 | /* | ||
106 | * Now fix it up | ||
107 | */ | ||
108 | timer->magic = TIMER_MAGIC; | ||
109 | } | ||
110 | |||
111 | static inline void check_timer(struct timer_list *timer) | ||
112 | { | ||
113 | if (timer->magic != TIMER_MAGIC) | ||
114 | check_timer_failed(timer); | ||
115 | } | ||
116 | |||
117 | |||
118 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 98 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) |
119 | { | 99 | { |
120 | unsigned long expires = timer->expires; | 100 | unsigned long expires = timer->expires; |
@@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer) | |||
177 | { | 157 | { |
178 | timer->entry.next = NULL; | 158 | timer->entry.next = NULL; |
179 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | 159 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; |
180 | timer->magic = TIMER_MAGIC; | ||
181 | } | 160 | } |
182 | EXPORT_SYMBOL(init_timer); | 161 | EXPORT_SYMBOL(init_timer); |
183 | 162 | ||
@@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
230 | int ret = 0; | 209 | int ret = 0; |
231 | 210 | ||
232 | BUG_ON(!timer->function); | 211 | BUG_ON(!timer->function); |
233 | check_timer(timer); | ||
234 | 212 | ||
235 | base = lock_timer_base(timer, &flags); | 213 | base = lock_timer_base(timer, &flags); |
236 | 214 | ||
@@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
283 | unsigned long flags; | 261 | unsigned long flags; |
284 | 262 | ||
285 | BUG_ON(timer_pending(timer) || !timer->function); | 263 | BUG_ON(timer_pending(timer) || !timer->function); |
286 | |||
287 | check_timer(timer); | ||
288 | |||
289 | spin_lock_irqsave(&base->t_base.lock, flags); | 264 | spin_lock_irqsave(&base->t_base.lock, flags); |
290 | timer->base = &base->t_base; | 265 | timer->base = &base->t_base; |
291 | internal_add_timer(base, timer); | 266 | internal_add_timer(base, timer); |
@@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
316 | { | 291 | { |
317 | BUG_ON(!timer->function); | 292 | BUG_ON(!timer->function); |
318 | 293 | ||
319 | check_timer(timer); | ||
320 | |||
321 | /* | 294 | /* |
322 | * This is a common optimization triggered by the | 295 | * This is a common optimization triggered by the |
323 | * networking code - if the timer is re-modified | 296 | * networking code - if the timer is re-modified |
@@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer) | |||
348 | unsigned long flags; | 321 | unsigned long flags; |
349 | int ret = 0; | 322 | int ret = 0; |
350 | 323 | ||
351 | check_timer(timer); | ||
352 | |||
353 | if (timer_pending(timer)) { | 324 | if (timer_pending(timer)) { |
354 | base = lock_timer_base(timer, &flags); | 325 | base = lock_timer_base(timer, &flags); |
355 | if (timer_pending(timer)) { | 326 | if (timer_pending(timer)) { |
@@ -412,8 +383,6 @@ out: | |||
412 | */ | 383 | */ |
413 | int del_timer_sync(struct timer_list *timer) | 384 | int del_timer_sync(struct timer_list *timer) |
414 | { | 385 | { |
415 | check_timer(timer); | ||
416 | |||
417 | for (;;) { | 386 | for (;;) { |
418 | int ret = try_to_del_timer_sync(timer); | 387 | int ret = try_to_del_timer_sync(timer); |
419 | if (ret >= 0) | 388 | if (ret >= 0) |
@@ -632,134 +601,118 @@ long time_next_adjust; | |||
632 | */ | 601 | */ |
633 | static void second_overflow(void) | 602 | static void second_overflow(void) |
634 | { | 603 | { |
635 | long ltemp; | 604 | long ltemp; |
636 | 605 | ||
637 | /* Bump the maxerror field */ | 606 | /* Bump the maxerror field */ |
638 | time_maxerror += time_tolerance >> SHIFT_USEC; | 607 | time_maxerror += time_tolerance >> SHIFT_USEC; |
639 | if ( time_maxerror > NTP_PHASE_LIMIT ) { | 608 | if (time_maxerror > NTP_PHASE_LIMIT) { |
640 | time_maxerror = NTP_PHASE_LIMIT; | 609 | time_maxerror = NTP_PHASE_LIMIT; |
641 | time_status |= STA_UNSYNC; | 610 | time_status |= STA_UNSYNC; |
642 | } | ||
643 | |||
644 | /* | ||
645 | * Leap second processing. If in leap-insert state at | ||
646 | * the end of the day, the system clock is set back one | ||
647 | * second; if in leap-delete state, the system clock is | ||
648 | * set ahead one second. The microtime() routine or | ||
649 | * external clock driver will insure that reported time | ||
650 | * is always monotonic. The ugly divides should be | ||
651 | * replaced. | ||
652 | */ | ||
653 | switch (time_state) { | ||
654 | |||
655 | case TIME_OK: | ||
656 | if (time_status & STA_INS) | ||
657 | time_state = TIME_INS; | ||
658 | else if (time_status & STA_DEL) | ||
659 | time_state = TIME_DEL; | ||
660 | break; | ||
661 | |||
662 | case TIME_INS: | ||
663 | if (xtime.tv_sec % 86400 == 0) { | ||
664 | xtime.tv_sec--; | ||
665 | wall_to_monotonic.tv_sec++; | ||
666 | /* The timer interpolator will make time change gradually instead | ||
667 | * of an immediate jump by one second. | ||
668 | */ | ||
669 | time_interpolator_update(-NSEC_PER_SEC); | ||
670 | time_state = TIME_OOP; | ||
671 | clock_was_set(); | ||
672 | printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); | ||
673 | } | 611 | } |
674 | break; | 612 | |
675 | 613 | /* | |
676 | case TIME_DEL: | 614 | * Leap second processing. If in leap-insert state at the end of the |
677 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 615 | * day, the system clock is set back one second; if in leap-delete |
678 | xtime.tv_sec++; | 616 | * state, the system clock is set ahead one second. The microtime() |
679 | wall_to_monotonic.tv_sec--; | 617 | * routine or external clock driver will insure that reported time is |
680 | /* Use of time interpolator for a gradual change of time */ | 618 | * always monotonic. The ugly divides should be replaced. |
681 | time_interpolator_update(NSEC_PER_SEC); | 619 | */ |
682 | time_state = TIME_WAIT; | 620 | switch (time_state) { |
683 | clock_was_set(); | 621 | case TIME_OK: |
684 | printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); | 622 | if (time_status & STA_INS) |
623 | time_state = TIME_INS; | ||
624 | else if (time_status & STA_DEL) | ||
625 | time_state = TIME_DEL; | ||
626 | break; | ||
627 | case TIME_INS: | ||
628 | if (xtime.tv_sec % 86400 == 0) { | ||
629 | xtime.tv_sec--; | ||
630 | wall_to_monotonic.tv_sec++; | ||
631 | /* | ||
632 | * The timer interpolator will make time change | ||
633 | * gradually instead of an immediate jump by one second | ||
634 | */ | ||
635 | time_interpolator_update(-NSEC_PER_SEC); | ||
636 | time_state = TIME_OOP; | ||
637 | clock_was_set(); | ||
638 | printk(KERN_NOTICE "Clock: inserting leap second " | ||
639 | "23:59:60 UTC\n"); | ||
640 | } | ||
641 | break; | ||
642 | case TIME_DEL: | ||
643 | if ((xtime.tv_sec + 1) % 86400 == 0) { | ||
644 | xtime.tv_sec++; | ||
645 | wall_to_monotonic.tv_sec--; | ||
646 | /* | ||
647 | * Use of time interpolator for a gradual change of | ||
648 | * time | ||
649 | */ | ||
650 | time_interpolator_update(NSEC_PER_SEC); | ||
651 | time_state = TIME_WAIT; | ||
652 | clock_was_set(); | ||
653 | printk(KERN_NOTICE "Clock: deleting leap second " | ||
654 | "23:59:59 UTC\n"); | ||
655 | } | ||
656 | break; | ||
657 | case TIME_OOP: | ||
658 | time_state = TIME_WAIT; | ||
659 | break; | ||
660 | case TIME_WAIT: | ||
661 | if (!(time_status & (STA_INS | STA_DEL))) | ||
662 | time_state = TIME_OK; | ||
685 | } | 663 | } |
686 | break; | 664 | |
687 | 665 | /* | |
688 | case TIME_OOP: | 666 | * Compute the phase adjustment for the next second. In PLL mode, the |
689 | time_state = TIME_WAIT; | 667 | * offset is reduced by a fixed factor times the time constant. In FLL |
690 | break; | 668 | * mode the offset is used directly. In either mode, the maximum phase |
691 | 669 | * adjustment for each second is clamped so as to spread the adjustment | |
692 | case TIME_WAIT: | 670 | * over not more than the number of seconds between updates. |
693 | if (!(time_status & (STA_INS | STA_DEL))) | 671 | */ |
694 | time_state = TIME_OK; | ||
695 | } | ||
696 | |||
697 | /* | ||
698 | * Compute the phase adjustment for the next second. In | ||
699 | * PLL mode, the offset is reduced by a fixed factor | ||
700 | * times the time constant. In FLL mode the offset is | ||
701 | * used directly. In either mode, the maximum phase | ||
702 | * adjustment for each second is clamped so as to spread | ||
703 | * the adjustment over not more than the number of | ||
704 | * seconds between updates. | ||
705 | */ | ||
706 | if (time_offset < 0) { | ||
707 | ltemp = -time_offset; | ||
708 | if (!(time_status & STA_FLL)) | ||
709 | ltemp >>= SHIFT_KG + time_constant; | ||
710 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | ||
711 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | ||
712 | time_offset += ltemp; | ||
713 | time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | ||
714 | } else { | ||
715 | ltemp = time_offset; | 672 | ltemp = time_offset; |
716 | if (!(time_status & STA_FLL)) | 673 | if (!(time_status & STA_FLL)) |
717 | ltemp >>= SHIFT_KG + time_constant; | 674 | ltemp = shift_right(ltemp, SHIFT_KG + time_constant); |
718 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | 675 | ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); |
719 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | 676 | ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); |
720 | time_offset -= ltemp; | 677 | time_offset -= ltemp; |
721 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | 678 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); |
722 | } | 679 | |
723 | 680 | /* | |
724 | /* | 681 | * Compute the frequency estimate and additional phase adjustment due |
725 | * Compute the frequency estimate and additional phase | 682 | * to frequency error for the next second. When the PPS signal is |
726 | * adjustment due to frequency error for the next | 683 | * engaged, gnaw on the watchdog counter and update the frequency |
727 | * second. When the PPS signal is engaged, gnaw on the | 684 | * computed by the pll and the PPS signal. |
728 | * watchdog counter and update the frequency computed by | 685 | */ |
729 | * the pll and the PPS signal. | 686 | pps_valid++; |
730 | */ | 687 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ |
731 | pps_valid++; | 688 | pps_jitter = MAXTIME; |
732 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ | 689 | pps_stabil = MAXFREQ; |
733 | pps_jitter = MAXTIME; | 690 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | |
734 | pps_stabil = MAXFREQ; | 691 | STA_PPSWANDER | STA_PPSERROR); |
735 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | 692 | } |
736 | STA_PPSWANDER | STA_PPSERROR); | 693 | ltemp = time_freq + pps_freq; |
737 | } | 694 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); |
738 | ltemp = time_freq + pps_freq; | ||
739 | if (ltemp < 0) | ||
740 | time_adj -= -ltemp >> | ||
741 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
742 | else | ||
743 | time_adj += ltemp >> | ||
744 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
745 | 695 | ||
746 | #if HZ == 100 | 696 | #if HZ == 100 |
747 | /* Compensate for (HZ==100) != (1 << SHIFT_HZ). | 697 | /* |
748 | * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) | 698 | * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to |
749 | */ | 699 | * get 128.125; => only 0.125% error (p. 14) |
750 | if (time_adj < 0) | 700 | */ |
751 | time_adj -= (-time_adj >> 2) + (-time_adj >> 5); | 701 | time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); |
752 | else | 702 | #endif |
753 | time_adj += (time_adj >> 2) + (time_adj >> 5); | 703 | #if HZ == 250 |
704 | /* | ||
705 | * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and | ||
706 | * 0.78125% to get 255.85938; => only 0.05% error (p. 14) | ||
707 | */ | ||
708 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); | ||
754 | #endif | 709 | #endif |
755 | #if HZ == 1000 | 710 | #if HZ == 1000 |
756 | /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). | 711 | /* |
757 | * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) | 712 | * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and |
758 | */ | 713 | * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) |
759 | if (time_adj < 0) | 714 | */ |
760 | time_adj -= (-time_adj >> 6) + (-time_adj >> 7); | 715 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
761 | else | ||
762 | time_adj += (time_adj >> 6) + (time_adj >> 7); | ||
763 | #endif | 716 | #endif |
764 | } | 717 | } |
765 | 718 | ||
@@ -768,23 +721,20 @@ static void update_wall_time_one_tick(void) | |||
768 | { | 721 | { |
769 | long time_adjust_step, delta_nsec; | 722 | long time_adjust_step, delta_nsec; |
770 | 723 | ||
771 | if ( (time_adjust_step = time_adjust) != 0 ) { | 724 | if ((time_adjust_step = time_adjust) != 0 ) { |
772 | /* We are doing an adjtime thing. | 725 | /* |
773 | * | 726 | * We are doing an adjtime thing. Prepare time_adjust_step to |
774 | * Prepare time_adjust_step to be within bounds. | 727 | * be within bounds. Note that a positive time_adjust means we |
775 | * Note that a positive time_adjust means we want the clock | 728 | * want the clock to run faster. |
776 | * to run faster. | 729 | * |
777 | * | 730 | * Limit the amount of the step to be in the range |
778 | * Limit the amount of the step to be in the range | 731 | * -tickadj .. +tickadj |
779 | * -tickadj .. +tickadj | 732 | */ |
780 | */ | 733 | time_adjust_step = min(time_adjust_step, (long)tickadj); |
781 | if (time_adjust > tickadj) | 734 | time_adjust_step = max(time_adjust_step, (long)-tickadj); |
782 | time_adjust_step = tickadj; | 735 | |
783 | else if (time_adjust < -tickadj) | 736 | /* Reduce by this step the amount of time left */ |
784 | time_adjust_step = -tickadj; | 737 | time_adjust -= time_adjust_step; |
785 | |||
786 | /* Reduce by this step the amount of time left */ | ||
787 | time_adjust -= time_adjust_step; | ||
788 | } | 738 | } |
789 | delta_nsec = tick_nsec + time_adjust_step * 1000; | 739 | delta_nsec = tick_nsec + time_adjust_step * 1000; |
790 | /* | 740 | /* |
@@ -792,13 +742,8 @@ static void update_wall_time_one_tick(void) | |||
792 | * advance the tick more. | 742 | * advance the tick more. |
793 | */ | 743 | */ |
794 | time_phase += time_adj; | 744 | time_phase += time_adj; |
795 | if (time_phase <= -FINENSEC) { | 745 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { |
796 | long ltemp = -time_phase >> (SHIFT_SCALE - 10); | 746 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); |
797 | time_phase += ltemp << (SHIFT_SCALE - 10); | ||
798 | delta_nsec -= ltemp; | ||
799 | } | ||
800 | else if (time_phase >= FINENSEC) { | ||
801 | long ltemp = time_phase >> (SHIFT_SCALE - 10); | ||
802 | time_phase -= ltemp << (SHIFT_SCALE - 10); | 747 | time_phase -= ltemp << (SHIFT_SCALE - 10); |
803 | delta_nsec += ltemp; | 748 | delta_nsec += ltemp; |
804 | } | 749 | } |
@@ -1128,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
1128 | if (timeout < 0) | 1073 | if (timeout < 0) |
1129 | { | 1074 | { |
1130 | printk(KERN_ERR "schedule_timeout: wrong timeout " | 1075 | printk(KERN_ERR "schedule_timeout: wrong timeout " |
1131 | "value %lx from %p\n", timeout, | 1076 | "value %lx from %p\n", timeout, |
1132 | __builtin_return_address(0)); | 1077 | __builtin_return_address(0)); |
1133 | current->state = TASK_RUNNING; | 1078 | current->state = TASK_RUNNING; |
1134 | goto out; | 1079 | goto out; |
1135 | } | 1080 | } |
@@ -1137,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
1137 | 1082 | ||
1138 | expire = timeout + jiffies; | 1083 | expire = timeout + jiffies; |
1139 | 1084 | ||
1140 | init_timer(&timer); | 1085 | setup_timer(&timer, process_timeout, (unsigned long)current); |
1141 | timer.expires = expire; | 1086 | __mod_timer(&timer, expire); |
1142 | timer.data = (unsigned long) current; | ||
1143 | timer.function = process_timeout; | ||
1144 | |||
1145 | add_timer(&timer); | ||
1146 | schedule(); | 1087 | schedule(); |
1147 | del_singleshot_timer_sync(&timer); | 1088 | del_singleshot_timer_sync(&timer); |
1148 | 1089 | ||
@@ -1159,15 +1100,15 @@ EXPORT_SYMBOL(schedule_timeout); | |||
1159 | */ | 1100 | */ |
1160 | signed long __sched schedule_timeout_interruptible(signed long timeout) | 1101 | signed long __sched schedule_timeout_interruptible(signed long timeout) |
1161 | { | 1102 | { |
1162 | __set_current_state(TASK_INTERRUPTIBLE); | 1103 | __set_current_state(TASK_INTERRUPTIBLE); |
1163 | return schedule_timeout(timeout); | 1104 | return schedule_timeout(timeout); |
1164 | } | 1105 | } |
1165 | EXPORT_SYMBOL(schedule_timeout_interruptible); | 1106 | EXPORT_SYMBOL(schedule_timeout_interruptible); |
1166 | 1107 | ||
1167 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | 1108 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) |
1168 | { | 1109 | { |
1169 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1110 | __set_current_state(TASK_UNINTERRUPTIBLE); |
1170 | return schedule_timeout(timeout); | 1111 | return schedule_timeout(timeout); |
1171 | } | 1112 | } |
1172 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1113 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1173 | 1114 | ||
@@ -1507,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec) | |||
1507 | if (!time_interpolator) | 1448 | if (!time_interpolator) |
1508 | return; | 1449 | return; |
1509 | 1450 | ||
1510 | /* The interpolator compensates for late ticks by accumulating | 1451 | /* |
1511 | * the late time in time_interpolator->offset. A tick earlier than | 1452 | * The interpolator compensates for late ticks by accumulating the late |
1512 | * expected will lead to a reset of the offset and a corresponding | 1453 | * time in time_interpolator->offset. A tick earlier than expected will |
1513 | * jump of the clock forward. Again this only works if the | 1454 | * lead to a reset of the offset and a corresponding jump of the clock |
1514 | * interpolator clock is running slightly slower than the regular clock | 1455 | * forward. Again this only works if the interpolator clock is running |
1515 | * and the tuning logic insures that. | 1456 | * slightly slower than the regular clock and the tuning logic insures |
1516 | */ | 1457 | * that. |
1458 | */ | ||
1517 | 1459 | ||
1518 | counter = time_interpolator_get_counter(1); | 1460 | counter = time_interpolator_get_counter(1); |
1519 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); | 1461 | offset = time_interpolator->offset + |
1462 | GET_TI_NSECS(counter, time_interpolator); | ||
1520 | 1463 | ||
1521 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | 1464 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) |
1522 | time_interpolator->offset = offset - delta_nsec; | 1465 | time_interpolator->offset = offset - delta_nsec; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91bacb13a7e2..7cee222231bc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -12,6 +12,8 @@ | |||
12 | * Andrew Morton <andrewm@uow.edu.au> | 12 | * Andrew Morton <andrewm@uow.edu.au> |
13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | 13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> |
14 | * Theodore Ts'o <tytso@mit.edu> | 14 | * Theodore Ts'o <tytso@mit.edu> |
15 | * | ||
16 | * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>. | ||
15 | */ | 17 | */ |
16 | 18 | ||
17 | #include <linux/module.h> | 19 | #include <linux/module.h> |
@@ -57,7 +59,7 @@ struct cpu_workqueue_struct { | |||
57 | * per-CPU workqueues: | 59 | * per-CPU workqueues: |
58 | */ | 60 | */ |
59 | struct workqueue_struct { | 61 | struct workqueue_struct { |
60 | struct cpu_workqueue_struct cpu_wq[NR_CPUS]; | 62 | struct cpu_workqueue_struct *cpu_wq; |
61 | const char *name; | 63 | const char *name; |
62 | struct list_head list; /* Empty if single thread */ | 64 | struct list_head list; /* Empty if single thread */ |
63 | }; | 65 | }; |
@@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
102 | if (unlikely(is_single_threaded(wq))) | 104 | if (unlikely(is_single_threaded(wq))) |
103 | cpu = 0; | 105 | cpu = 0; |
104 | BUG_ON(!list_empty(&work->entry)); | 106 | BUG_ON(!list_empty(&work->entry)); |
105 | __queue_work(wq->cpu_wq + cpu, work); | 107 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
106 | ret = 1; | 108 | ret = 1; |
107 | } | 109 | } |
108 | put_cpu(); | 110 | put_cpu(); |
@@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
118 | if (unlikely(is_single_threaded(wq))) | 120 | if (unlikely(is_single_threaded(wq))) |
119 | cpu = 0; | 121 | cpu = 0; |
120 | 122 | ||
121 | __queue_work(wq->cpu_wq + cpu, work); | 123 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
122 | } | 124 | } |
123 | 125 | ||
124 | int fastcall queue_delayed_work(struct workqueue_struct *wq, | 126 | int fastcall queue_delayed_work(struct workqueue_struct *wq, |
@@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
265 | 267 | ||
266 | if (is_single_threaded(wq)) { | 268 | if (is_single_threaded(wq)) { |
267 | /* Always use cpu 0's area. */ | 269 | /* Always use cpu 0's area. */ |
268 | flush_cpu_workqueue(wq->cpu_wq + 0); | 270 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0)); |
269 | } else { | 271 | } else { |
270 | int cpu; | 272 | int cpu; |
271 | 273 | ||
272 | lock_cpu_hotplug(); | 274 | lock_cpu_hotplug(); |
273 | for_each_online_cpu(cpu) | 275 | for_each_online_cpu(cpu) |
274 | flush_cpu_workqueue(wq->cpu_wq + cpu); | 276 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); |
275 | unlock_cpu_hotplug(); | 277 | unlock_cpu_hotplug(); |
276 | } | 278 | } |
277 | } | 279 | } |
@@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
279 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | 281 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, |
280 | int cpu) | 282 | int cpu) |
281 | { | 283 | { |
282 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | 284 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
283 | struct task_struct *p; | 285 | struct task_struct *p; |
284 | 286 | ||
285 | spin_lock_init(&cwq->lock); | 287 | spin_lock_init(&cwq->lock); |
@@ -312,6 +314,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
312 | if (!wq) | 314 | if (!wq) |
313 | return NULL; | 315 | return NULL; |
314 | 316 | ||
317 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | ||
315 | wq->name = name; | 318 | wq->name = name; |
316 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 319 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
317 | lock_cpu_hotplug(); | 320 | lock_cpu_hotplug(); |
@@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) | |||
353 | unsigned long flags; | 356 | unsigned long flags; |
354 | struct task_struct *p; | 357 | struct task_struct *p; |
355 | 358 | ||
356 | cwq = wq->cpu_wq + cpu; | 359 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
357 | spin_lock_irqsave(&cwq->lock, flags); | 360 | spin_lock_irqsave(&cwq->lock, flags); |
358 | p = cwq->thread; | 361 | p = cwq->thread; |
359 | cwq->thread = NULL; | 362 | cwq->thread = NULL; |
@@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
380 | spin_unlock(&workqueue_lock); | 383 | spin_unlock(&workqueue_lock); |
381 | } | 384 | } |
382 | unlock_cpu_hotplug(); | 385 | unlock_cpu_hotplug(); |
386 | free_percpu(wq->cpu_wq); | ||
383 | kfree(wq); | 387 | kfree(wq); |
384 | } | 388 | } |
385 | 389 | ||
@@ -458,7 +462,7 @@ int current_is_keventd(void) | |||
458 | 462 | ||
459 | BUG_ON(!keventd_wq); | 463 | BUG_ON(!keventd_wq); |
460 | 464 | ||
461 | cwq = keventd_wq->cpu_wq + cpu; | 465 | cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); |
462 | if (current == cwq->thread) | 466 | if (current == cwq->thread) |
463 | ret = 1; | 467 | ret = 1; |
464 | 468 | ||
@@ -470,7 +474,7 @@ int current_is_keventd(void) | |||
470 | /* Take the work from this (downed) CPU. */ | 474 | /* Take the work from this (downed) CPU. */ |
471 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | 475 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) |
472 | { | 476 | { |
473 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | 477 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
474 | LIST_HEAD(list); | 478 | LIST_HEAD(list); |
475 | struct work_struct *work; | 479 | struct work_struct *work; |
476 | 480 | ||
@@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
481 | printk("Taking work for %s\n", wq->name); | 485 | printk("Taking work for %s\n", wq->name); |
482 | work = list_entry(list.next,struct work_struct,entry); | 486 | work = list_entry(list.next,struct work_struct,entry); |
483 | list_del(&work->entry); | 487 | list_del(&work->entry); |
484 | __queue_work(wq->cpu_wq + smp_processor_id(), work); | 488 | __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); |
485 | } | 489 | } |
486 | spin_unlock_irq(&cwq->lock); | 490 | spin_unlock_irq(&cwq->lock); |
487 | } | 491 | } |
@@ -508,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
508 | case CPU_ONLINE: | 512 | case CPU_ONLINE: |
509 | /* Kick off worker threads. */ | 513 | /* Kick off worker threads. */ |
510 | list_for_each_entry(wq, &workqueues, list) { | 514 | list_for_each_entry(wq, &workqueues, list) { |
511 | kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); | 515 | struct cpu_workqueue_struct *cwq; |
512 | wake_up_process(wq->cpu_wq[hotcpu].thread); | 516 | |
517 | cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); | ||
518 | kthread_bind(cwq->thread, hotcpu); | ||
519 | wake_up_process(cwq->thread); | ||
513 | } | 520 | } |
514 | break; | 521 | break; |
515 | 522 | ||
516 | case CPU_UP_CANCELED: | 523 | case CPU_UP_CANCELED: |
517 | list_for_each_entry(wq, &workqueues, list) { | 524 | list_for_each_entry(wq, &workqueues, list) { |
518 | /* Unbind so it can run. */ | 525 | /* Unbind so it can run. */ |
519 | kthread_bind(wq->cpu_wq[hotcpu].thread, | 526 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, |
520 | smp_processor_id()); | 527 | smp_processor_id()); |
521 | cleanup_workqueue_thread(wq, hotcpu); | 528 | cleanup_workqueue_thread(wq, hotcpu); |
522 | } | 529 | } |