diff options
Diffstat (limited to 'kernel')
43 files changed, 2573 insertions, 1620 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index ff4dc02ce170..4f5a1453093a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o | |||
22 | obj-$(CONFIG_COMPAT) += compat.o | 22 | obj-$(CONFIG_COMPAT) += compat.o |
23 | obj-$(CONFIG_CPUSETS) += cpuset.o | 23 | obj-$(CONFIG_CPUSETS) += cpuset.o |
24 | obj-$(CONFIG_IKCONFIG) += configs.o | 24 | obj-$(CONFIG_IKCONFIG) += configs.o |
25 | obj-$(CONFIG_IKCONFIG_PROC) += configs.o | ||
26 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 25 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
27 | obj-$(CONFIG_AUDIT) += audit.o | 26 | obj-$(CONFIG_AUDIT) += audit.o |
28 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 27 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
@@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | |||
32 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 31 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
33 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 32 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
34 | obj-$(CONFIG_SECCOMP) += seccomp.o | 33 | obj-$(CONFIG_SECCOMP) += seccomp.o |
34 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | ||
35 | 35 | ||
36 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 36 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
37 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 37 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/acct.c b/kernel/acct.c index b756f527497e..6312d6bd43e3 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -54,6 +54,7 @@ | |||
54 | #include <linux/jiffies.h> | 54 | #include <linux/jiffies.h> |
55 | #include <linux/times.h> | 55 | #include <linux/times.h> |
56 | #include <linux/syscalls.h> | 56 | #include <linux/syscalls.h> |
57 | #include <linux/mount.h> | ||
57 | #include <asm/uaccess.h> | 58 | #include <asm/uaccess.h> |
58 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
59 | #include <linux/blkdev.h> /* sector_div */ | 60 | #include <linux/blkdev.h> /* sector_div */ |
@@ -192,6 +193,7 @@ static void acct_file_reopen(struct file *file) | |||
192 | add_timer(&acct_globals.timer); | 193 | add_timer(&acct_globals.timer); |
193 | } | 194 | } |
194 | if (old_acct) { | 195 | if (old_acct) { |
196 | mnt_unpin(old_acct->f_vfsmnt); | ||
195 | spin_unlock(&acct_globals.lock); | 197 | spin_unlock(&acct_globals.lock); |
196 | do_acct_process(0, old_acct); | 198 | do_acct_process(0, old_acct); |
197 | filp_close(old_acct, NULL); | 199 | filp_close(old_acct, NULL); |
@@ -199,6 +201,42 @@ static void acct_file_reopen(struct file *file) | |||
199 | } | 201 | } |
200 | } | 202 | } |
201 | 203 | ||
204 | static int acct_on(char *name) | ||
205 | { | ||
206 | struct file *file; | ||
207 | int error; | ||
208 | |||
209 | /* Difference from BSD - they don't do O_APPEND */ | ||
210 | file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); | ||
211 | if (IS_ERR(file)) | ||
212 | return PTR_ERR(file); | ||
213 | |||
214 | if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { | ||
215 | filp_close(file, NULL); | ||
216 | return -EACCES; | ||
217 | } | ||
218 | |||
219 | if (!file->f_op->write) { | ||
220 | filp_close(file, NULL); | ||
221 | return -EIO; | ||
222 | } | ||
223 | |||
224 | error = security_acct(file); | ||
225 | if (error) { | ||
226 | filp_close(file, NULL); | ||
227 | return error; | ||
228 | } | ||
229 | |||
230 | spin_lock(&acct_globals.lock); | ||
231 | mnt_pin(file->f_vfsmnt); | ||
232 | acct_file_reopen(file); | ||
233 | spin_unlock(&acct_globals.lock); | ||
234 | |||
235 | mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ | ||
236 | |||
237 | return 0; | ||
238 | } | ||
239 | |||
202 | /** | 240 | /** |
203 | * sys_acct - enable/disable process accounting | 241 | * sys_acct - enable/disable process accounting |
204 | * @name: file name for accounting records or NULL to shutdown accounting | 242 | * @name: file name for accounting records or NULL to shutdown accounting |
@@ -212,47 +250,41 @@ static void acct_file_reopen(struct file *file) | |||
212 | */ | 250 | */ |
213 | asmlinkage long sys_acct(const char __user *name) | 251 | asmlinkage long sys_acct(const char __user *name) |
214 | { | 252 | { |
215 | struct file *file = NULL; | ||
216 | char *tmp; | ||
217 | int error; | 253 | int error; |
218 | 254 | ||
219 | if (!capable(CAP_SYS_PACCT)) | 255 | if (!capable(CAP_SYS_PACCT)) |
220 | return -EPERM; | 256 | return -EPERM; |
221 | 257 | ||
222 | if (name) { | 258 | if (name) { |
223 | tmp = getname(name); | 259 | char *tmp = getname(name); |
224 | if (IS_ERR(tmp)) { | 260 | if (IS_ERR(tmp)) |
225 | return (PTR_ERR(tmp)); | 261 | return (PTR_ERR(tmp)); |
226 | } | 262 | error = acct_on(tmp); |
227 | /* Difference from BSD - they don't do O_APPEND */ | ||
228 | file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); | ||
229 | putname(tmp); | 263 | putname(tmp); |
230 | if (IS_ERR(file)) { | 264 | } else { |
231 | return (PTR_ERR(file)); | 265 | error = security_acct(NULL); |
232 | } | 266 | if (!error) { |
233 | if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { | 267 | spin_lock(&acct_globals.lock); |
234 | filp_close(file, NULL); | 268 | acct_file_reopen(NULL); |
235 | return (-EACCES); | 269 | spin_unlock(&acct_globals.lock); |
236 | } | ||
237 | |||
238 | if (!file->f_op->write) { | ||
239 | filp_close(file, NULL); | ||
240 | return (-EIO); | ||
241 | } | 270 | } |
242 | } | 271 | } |
272 | return error; | ||
273 | } | ||
243 | 274 | ||
244 | error = security_acct(file); | 275 | /** |
245 | if (error) { | 276 | * acct_auto_close_mnt - turn off a filesystem's accounting if it is on |
246 | if (file) | 277 | * @m: vfsmount being shut down |
247 | filp_close(file, NULL); | 278 | * |
248 | return error; | 279 | * If the accounting is turned on for a file in the subtree pointed to |
249 | } | 280 | * by m, turn accounting off. Done when m is about to die. |
250 | 281 | */ | |
282 | void acct_auto_close_mnt(struct vfsmount *m) | ||
283 | { | ||
251 | spin_lock(&acct_globals.lock); | 284 | spin_lock(&acct_globals.lock); |
252 | acct_file_reopen(file); | 285 | if (acct_globals.file && acct_globals.file->f_vfsmnt == m) |
286 | acct_file_reopen(NULL); | ||
253 | spin_unlock(&acct_globals.lock); | 287 | spin_unlock(&acct_globals.lock); |
254 | |||
255 | return (0); | ||
256 | } | 288 | } |
257 | 289 | ||
258 | /** | 290 | /** |
@@ -266,8 +298,8 @@ void acct_auto_close(struct super_block *sb) | |||
266 | { | 298 | { |
267 | spin_lock(&acct_globals.lock); | 299 | spin_lock(&acct_globals.lock); |
268 | if (acct_globals.file && | 300 | if (acct_globals.file && |
269 | acct_globals.file->f_dentry->d_inode->i_sb == sb) { | 301 | acct_globals.file->f_vfsmnt->mnt_sb == sb) { |
270 | acct_file_reopen((struct file *)NULL); | 302 | acct_file_reopen(NULL); |
271 | } | 303 | } |
272 | spin_unlock(&acct_globals.lock); | 304 | spin_unlock(&acct_globals.lock); |
273 | } | 305 | } |
@@ -553,7 +585,7 @@ void acct_update_integrals(struct task_struct *tsk) | |||
553 | if (delta == 0) | 585 | if (delta == 0) |
554 | return; | 586 | return; |
555 | tsk->acct_stimexpd = tsk->stime; | 587 | tsk->acct_stimexpd = tsk->stime; |
556 | tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); | 588 | tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); |
557 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; | 589 | tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; |
558 | } | 590 | } |
559 | } | 591 | } |
diff --git a/kernel/audit.c b/kernel/audit.c index 83096b67510a..0c56320d38dc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -133,7 +133,7 @@ struct audit_buffer { | |||
133 | struct list_head list; | 133 | struct list_head list; |
134 | struct sk_buff *skb; /* formatted skb ready to send */ | 134 | struct sk_buff *skb; /* formatted skb ready to send */ |
135 | struct audit_context *ctx; /* NULL or associated context */ | 135 | struct audit_context *ctx; /* NULL or associated context */ |
136 | int gfp_mask; | 136 | gfp_t gfp_mask; |
137 | }; | 137 | }; |
138 | 138 | ||
139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 139 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
@@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab) | |||
560 | } | 560 | } |
561 | 561 | ||
562 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, | 562 | static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, |
563 | unsigned int __nocast gfp_mask, int type) | 563 | gfp_t gfp_mask, int type) |
564 | { | 564 | { |
565 | unsigned long flags; | 565 | unsigned long flags; |
566 | struct audit_buffer *ab = NULL; | 566 | struct audit_buffer *ab = NULL; |
@@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx, | |||
647 | * will be written at syscall exit. If there is no associated task, tsk | 647 | * will be written at syscall exit. If there is no associated task, tsk |
648 | * should be NULL. */ | 648 | * should be NULL. */ |
649 | 649 | ||
650 | struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, | 650 | struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, |
651 | int type) | 651 | int type) |
652 | { | 652 | { |
653 | struct audit_buffer *ab = NULL; | 653 | struct audit_buffer *ab = NULL; |
@@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab) | |||
879 | /* Log an audit record. This is a convenience function that calls | 879 | /* Log an audit record. This is a convenience function that calls |
880 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be | 880 | * audit_log_start, audit_log_vformat, and audit_log_end. It may be |
881 | * called in any context. */ | 881 | * called in any context. */ |
882 | void audit_log(struct audit_context *ctx, int gfp_mask, int type, | 882 | void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, |
883 | const char *fmt, ...) | 883 | const char *fmt, ...) |
884 | { | 884 | { |
885 | struct audit_buffer *ab; | 885 | struct audit_buffer *ab; |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 88696f639aab..d8a68509e729 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab) | |||
803 | up_read(&mm->mmap_sem); | 803 | up_read(&mm->mmap_sem); |
804 | } | 804 | } |
805 | 805 | ||
806 | static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) | 806 | static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) |
807 | { | 807 | { |
808 | int i; | 808 | int i; |
809 | struct audit_buffer *ab; | 809 | struct audit_buffer *ab; |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 53d8263ae12e..e882c6babf41 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -16,28 +16,76 @@ | |||
16 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
19 | DECLARE_MUTEX(cpucontrol); | 19 | static DECLARE_MUTEX(cpucontrol); |
20 | 20 | ||
21 | static struct notifier_block *cpu_chain; | 21 | static struct notifier_block *cpu_chain; |
22 | 22 | ||
23 | #ifdef CONFIG_HOTPLUG_CPU | ||
24 | static struct task_struct *lock_cpu_hotplug_owner; | ||
25 | static int lock_cpu_hotplug_depth; | ||
26 | |||
27 | static int __lock_cpu_hotplug(int interruptible) | ||
28 | { | ||
29 | int ret = 0; | ||
30 | |||
31 | if (lock_cpu_hotplug_owner != current) { | ||
32 | if (interruptible) | ||
33 | ret = down_interruptible(&cpucontrol); | ||
34 | else | ||
35 | down(&cpucontrol); | ||
36 | } | ||
37 | |||
38 | /* | ||
39 | * Set only if we succeed in locking | ||
40 | */ | ||
41 | if (!ret) { | ||
42 | lock_cpu_hotplug_depth++; | ||
43 | lock_cpu_hotplug_owner = current; | ||
44 | } | ||
45 | |||
46 | return ret; | ||
47 | } | ||
48 | |||
49 | void lock_cpu_hotplug(void) | ||
50 | { | ||
51 | __lock_cpu_hotplug(0); | ||
52 | } | ||
53 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); | ||
54 | |||
55 | void unlock_cpu_hotplug(void) | ||
56 | { | ||
57 | if (--lock_cpu_hotplug_depth == 0) { | ||
58 | lock_cpu_hotplug_owner = NULL; | ||
59 | up(&cpucontrol); | ||
60 | } | ||
61 | } | ||
62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | ||
63 | |||
64 | int lock_cpu_hotplug_interruptible(void) | ||
65 | { | ||
66 | return __lock_cpu_hotplug(1); | ||
67 | } | ||
68 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); | ||
69 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
70 | |||
23 | /* Need to know about CPUs going up/down? */ | 71 | /* Need to know about CPUs going up/down? */ |
24 | int register_cpu_notifier(struct notifier_block *nb) | 72 | int register_cpu_notifier(struct notifier_block *nb) |
25 | { | 73 | { |
26 | int ret; | 74 | int ret; |
27 | 75 | ||
28 | if ((ret = down_interruptible(&cpucontrol)) != 0) | 76 | if ((ret = lock_cpu_hotplug_interruptible()) != 0) |
29 | return ret; | 77 | return ret; |
30 | ret = notifier_chain_register(&cpu_chain, nb); | 78 | ret = notifier_chain_register(&cpu_chain, nb); |
31 | up(&cpucontrol); | 79 | unlock_cpu_hotplug(); |
32 | return ret; | 80 | return ret; |
33 | } | 81 | } |
34 | EXPORT_SYMBOL(register_cpu_notifier); | 82 | EXPORT_SYMBOL(register_cpu_notifier); |
35 | 83 | ||
36 | void unregister_cpu_notifier(struct notifier_block *nb) | 84 | void unregister_cpu_notifier(struct notifier_block *nb) |
37 | { | 85 | { |
38 | down(&cpucontrol); | 86 | lock_cpu_hotplug(); |
39 | notifier_chain_unregister(&cpu_chain, nb); | 87 | notifier_chain_unregister(&cpu_chain, nb); |
40 | up(&cpucontrol); | 88 | unlock_cpu_hotplug(); |
41 | } | 89 | } |
42 | EXPORT_SYMBOL(unregister_cpu_notifier); | 90 | EXPORT_SYMBOL(unregister_cpu_notifier); |
43 | 91 | ||
@@ -155,13 +203,14 @@ int __devinit cpu_up(unsigned int cpu) | |||
155 | int ret; | 203 | int ret; |
156 | void *hcpu = (void *)(long)cpu; | 204 | void *hcpu = (void *)(long)cpu; |
157 | 205 | ||
158 | if ((ret = down_interruptible(&cpucontrol)) != 0) | 206 | if ((ret = lock_cpu_hotplug_interruptible()) != 0) |
159 | return ret; | 207 | return ret; |
160 | 208 | ||
161 | if (cpu_online(cpu) || !cpu_present(cpu)) { | 209 | if (cpu_online(cpu) || !cpu_present(cpu)) { |
162 | ret = -EINVAL; | 210 | ret = -EINVAL; |
163 | goto out; | 211 | goto out; |
164 | } | 212 | } |
213 | |||
165 | ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); | 214 | ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); |
166 | if (ret == NOTIFY_BAD) { | 215 | if (ret == NOTIFY_BAD) { |
167 | printk("%s: attempt to bring up CPU %u failed\n", | 216 | printk("%s: attempt to bring up CPU %u failed\n", |
@@ -184,6 +233,6 @@ out_notify: | |||
184 | if (ret != 0) | 233 | if (ret != 0) |
185 | notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); | 234 | notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); |
186 | out: | 235 | out: |
187 | up(&cpucontrol); | 236 | unlock_cpu_hotplug(); |
188 | return ret; | 237 | return ret; |
189 | } | 238 | } |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 79866bc6b3a1..7430640f9816 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/kernel.h> | 32 | #include <linux/kernel.h> |
33 | #include <linux/kmod.h> | 33 | #include <linux/kmod.h> |
34 | #include <linux/list.h> | 34 | #include <linux/list.h> |
35 | #include <linux/mempolicy.h> | ||
35 | #include <linux/mm.h> | 36 | #include <linux/mm.h> |
36 | #include <linux/module.h> | 37 | #include <linux/module.h> |
37 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
@@ -60,6 +61,9 @@ struct cpuset { | |||
60 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 61 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
61 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 62 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
62 | 63 | ||
64 | /* | ||
65 | * Count is atomic so can incr (fork) or decr (exit) without a lock. | ||
66 | */ | ||
63 | atomic_t count; /* count tasks using this cpuset */ | 67 | atomic_t count; /* count tasks using this cpuset */ |
64 | 68 | ||
65 | /* | 69 | /* |
@@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount; | |||
142 | static struct super_block *cpuset_sb = NULL; | 146 | static struct super_block *cpuset_sb = NULL; |
143 | 147 | ||
144 | /* | 148 | /* |
145 | * cpuset_sem should be held by anyone who is depending on the children | 149 | * We have two global cpuset semaphores below. They can nest. |
146 | * or sibling lists of any cpuset, or performing non-atomic operations | 150 | * It is ok to first take manage_sem, then nest callback_sem. We also |
147 | * on the flags or *_allowed values of a cpuset, such as raising the | 151 | * require taking task_lock() when dereferencing a tasks cpuset pointer. |
148 | * CS_REMOVED flag bit iff it is not already raised, or reading and | 152 | * See "The task_lock() exception", at the end of this comment. |
149 | * conditionally modifying the *_allowed values. One kernel global | 153 | * |
150 | * cpuset semaphore should be sufficient - these things don't change | 154 | * A task must hold both semaphores to modify cpusets. If a task |
151 | * that much. | 155 | * holds manage_sem, then it blocks others wanting that semaphore, |
152 | * | 156 | * ensuring that it is the only task able to also acquire callback_sem |
153 | * The code that modifies cpusets holds cpuset_sem across the entire | 157 | * and be able to modify cpusets. It can perform various checks on |
154 | * operation, from cpuset_common_file_write() down, single threading | 158 | * the cpuset structure first, knowing nothing will change. It can |
155 | * all cpuset modifications (except for counter manipulations from | 159 | * also allocate memory while just holding manage_sem. While it is |
156 | * fork and exit) across the system. This presumes that cpuset | 160 | * performing these checks, various callback routines can briefly |
157 | * modifications are rare - better kept simple and safe, even if slow. | 161 | * acquire callback_sem to query cpusets. Once it is ready to make |
158 | * | 162 | * the changes, it takes callback_sem, blocking everyone else. |
159 | * The code that reads cpusets, such as in cpuset_common_file_read() | 163 | * |
160 | * and below, only holds cpuset_sem across small pieces of code, such | 164 | * Calls to the kernel memory allocator can not be made while holding |
161 | * as when reading out possibly multi-word cpumasks and nodemasks, as | 165 | * callback_sem, as that would risk double tripping on callback_sem |
162 | * the risks are less, and the desire for performance a little greater. | 166 | * from one of the callbacks into the cpuset code from within |
163 | * The proc_cpuset_show() routine needs to hold cpuset_sem to insure | 167 | * __alloc_pages(). |
164 | * that no cs->dentry is NULL, as it walks up the cpuset tree to root. | 168 | * |
165 | * | 169 | * If a task is only holding callback_sem, then it has read-only |
166 | * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't | 170 | * access to cpusets. |
167 | * (usually) grab cpuset_sem. These are the two most performance | 171 | * |
168 | * critical pieces of code here. The exception occurs on exit(), | 172 | * The task_struct fields mems_allowed and mems_generation may only |
169 | * when a task in a notify_on_release cpuset exits. Then cpuset_sem | 173 | * be accessed in the context of that task, so require no locks. |
174 | * | ||
175 | * Any task can increment and decrement the count field without lock. | ||
176 | * So in general, code holding manage_sem or callback_sem can't rely | ||
177 | * on the count field not changing. However, if the count goes to | ||
178 | * zero, then only attach_task(), which holds both semaphores, can | ||
179 | * increment it again. Because a count of zero means that no tasks | ||
180 | * are currently attached, therefore there is no way a task attached | ||
181 | * to that cpuset can fork (the other way to increment the count). | ||
182 | * So code holding manage_sem or callback_sem can safely assume that | ||
183 | * if the count is zero, it will stay zero. Similarly, if a task | ||
184 | * holds manage_sem or callback_sem on a cpuset with zero count, it | ||
185 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
186 | * both of those semaphores. | ||
187 | * | ||
188 | * A possible optimization to improve parallelism would be to make | ||
189 | * callback_sem a R/W semaphore (rwsem), allowing the callback routines | ||
190 | * to proceed in parallel, with read access, until the holder of | ||
191 | * manage_sem needed to take this rwsem for exclusive write access | ||
192 | * and modify some cpusets. | ||
193 | * | ||
194 | * The cpuset_common_file_write handler for operations that modify | ||
195 | * the cpuset hierarchy holds manage_sem across the entire operation, | ||
196 | * single threading all such cpuset modifications across the system. | ||
197 | * | ||
198 | * The cpuset_common_file_read() handlers only hold callback_sem across | ||
199 | * small pieces of code, such as when reading out possibly multi-word | ||
200 | * cpumasks and nodemasks. | ||
201 | * | ||
202 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | ||
203 | * (usually) take either semaphore. These are the two most performance | ||
204 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
205 | * when a task in a notify_on_release cpuset exits. Then manage_sem | ||
170 | * is taken, and if the cpuset count is zero, a usermode call made | 206 | * is taken, and if the cpuset count is zero, a usermode call made |
171 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | 207 | * to /sbin/cpuset_release_agent with the name of the cpuset (path |
172 | * relative to the root of cpuset file system) as the argument. | 208 | * relative to the root of cpuset file system) as the argument. |
173 | * | 209 | * |
174 | * A cpuset can only be deleted if both its 'count' of using tasks is | 210 | * A cpuset can only be deleted if both its 'count' of using tasks |
175 | * zero, and its list of 'children' cpusets is empty. Since all tasks | 211 | * is zero, and its list of 'children' cpusets is empty. Since all |
176 | * in the system use _some_ cpuset, and since there is always at least | 212 | * tasks in the system use _some_ cpuset, and since there is always at |
177 | * one task in the system (init, pid == 1), therefore, top_cpuset | 213 | * least one task in the system (init, pid == 1), therefore, top_cpuset |
178 | * always has either children cpusets and/or using tasks. So no need | 214 | * always has either children cpusets and/or using tasks. So we don't |
179 | * for any special hack to ensure that top_cpuset cannot be deleted. | 215 | * need a special hack to ensure that top_cpuset cannot be deleted. |
216 | * | ||
217 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
218 | * | ||
219 | * The task_lock() exception | ||
220 | * | ||
221 | * The need for this exception arises from the action of attach_task(), | ||
222 | * which overwrites one task's cpuset pointer with another. It does | ||
223 | * so using both semaphores, however there are several performance | ||
224 | * critical places that need to reference task->cpuset without the | ||
225 | * expense of grabbing a system global semaphore. Therefore except as | ||
226 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
227 | * a task's cpuset pointer we use task_lock(), which acts on a spinlock | ||
228 | * (task->alloc_lock) already in the task_struct routinely used for | ||
229 | * such matters. | ||
180 | */ | 230 | */ |
181 | 231 | ||
182 | static DECLARE_MUTEX(cpuset_sem); | 232 | static DECLARE_MUTEX(manage_sem); |
183 | static struct task_struct *cpuset_sem_owner; | 233 | static DECLARE_MUTEX(callback_sem); |
184 | static int cpuset_sem_depth; | ||
185 | |||
186 | /* | ||
187 | * The global cpuset semaphore cpuset_sem can be needed by the | ||
188 | * memory allocator to update a tasks mems_allowed (see the calls | ||
189 | * to cpuset_update_current_mems_allowed()) or to walk up the | ||
190 | * cpuset hierarchy to find a mem_exclusive cpuset see the calls | ||
191 | * to cpuset_excl_nodes_overlap()). | ||
192 | * | ||
193 | * But if the memory allocation is being done by cpuset.c code, it | ||
194 | * usually already holds cpuset_sem. Double tripping on a kernel | ||
195 | * semaphore deadlocks the current task, and any other task that | ||
196 | * subsequently tries to obtain the lock. | ||
197 | * | ||
198 | * Run all up's and down's on cpuset_sem through the following | ||
199 | * wrappers, which will detect this nested locking, and avoid | ||
200 | * deadlocking. | ||
201 | */ | ||
202 | |||
203 | static inline void cpuset_down(struct semaphore *psem) | ||
204 | { | ||
205 | if (cpuset_sem_owner != current) { | ||
206 | down(psem); | ||
207 | cpuset_sem_owner = current; | ||
208 | } | ||
209 | cpuset_sem_depth++; | ||
210 | } | ||
211 | |||
212 | static inline void cpuset_up(struct semaphore *psem) | ||
213 | { | ||
214 | if (--cpuset_sem_depth == 0) { | ||
215 | cpuset_sem_owner = NULL; | ||
216 | up(psem); | ||
217 | } | ||
218 | } | ||
219 | 234 | ||
220 | /* | 235 | /* |
221 | * A couple of forward declarations required, due to cyclic reference loop: | 236 | * A couple of forward declarations required, due to cyclic reference loop: |
@@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) | |||
390 | } | 405 | } |
391 | 406 | ||
392 | /* | 407 | /* |
393 | * Call with cpuset_sem held. Writes path of cpuset into buf. | 408 | * Call with manage_sem held. Writes path of cpuset into buf. |
394 | * Returns 0 on success, -errno on error. | 409 | * Returns 0 on success, -errno on error. |
395 | */ | 410 | */ |
396 | 411 | ||
@@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) | |||
442 | * status of the /sbin/cpuset_release_agent task, so no sense holding | 457 | * status of the /sbin/cpuset_release_agent task, so no sense holding |
443 | * our caller up for that. | 458 | * our caller up for that. |
444 | * | 459 | * |
445 | * The simple act of forking that task might require more memory, | 460 | * When we had only one cpuset semaphore, we had to call this |
446 | * which might need cpuset_sem. So this routine must be called while | 461 | * without holding it, to avoid deadlock when call_usermodehelper() |
447 | * cpuset_sem is not held, to avoid a possible deadlock. See also | 462 | * allocated memory. With two locks, we could now call this while |
448 | * comments for check_for_release(), below. | 463 | * holding manage_sem, but we still don't, so as to minimize |
464 | * the time manage_sem is held. | ||
449 | */ | 465 | */ |
450 | 466 | ||
451 | static void cpuset_release_agent(const char *pathbuf) | 467 | static void cpuset_release_agent(const char *pathbuf) |
@@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf) | |||
477 | * cs is notify_on_release() and now both the user count is zero and | 493 | * cs is notify_on_release() and now both the user count is zero and |
478 | * the list of children is empty, prepare cpuset path in a kmalloc'd | 494 | * the list of children is empty, prepare cpuset path in a kmalloc'd |
479 | * buffer, to be returned via ppathbuf, so that the caller can invoke | 495 | * buffer, to be returned via ppathbuf, so that the caller can invoke |
480 | * cpuset_release_agent() with it later on, once cpuset_sem is dropped. | 496 | * cpuset_release_agent() with it later on, once manage_sem is dropped. |
481 | * Call here with cpuset_sem held. | 497 | * Call here with manage_sem held. |
482 | * | 498 | * |
483 | * This check_for_release() routine is responsible for kmalloc'ing | 499 | * This check_for_release() routine is responsible for kmalloc'ing |
484 | * pathbuf. The above cpuset_release_agent() is responsible for | 500 | * pathbuf. The above cpuset_release_agent() is responsible for |
485 | * kfree'ing pathbuf. The caller of these routines is responsible | 501 | * kfree'ing pathbuf. The caller of these routines is responsible |
486 | * for providing a pathbuf pointer, initialized to NULL, then | 502 | * for providing a pathbuf pointer, initialized to NULL, then |
487 | * calling check_for_release() with cpuset_sem held and the address | 503 | * calling check_for_release() with manage_sem held and the address |
488 | * of the pathbuf pointer, then dropping cpuset_sem, then calling | 504 | * of the pathbuf pointer, then dropping manage_sem, then calling |
489 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). | 505 | * cpuset_release_agent() with pathbuf, as set by check_for_release(). |
490 | */ | 506 | */ |
491 | 507 | ||
@@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf) | |||
516 | * One way or another, we guarantee to return some non-empty subset | 532 | * One way or another, we guarantee to return some non-empty subset |
517 | * of cpu_online_map. | 533 | * of cpu_online_map. |
518 | * | 534 | * |
519 | * Call with cpuset_sem held. | 535 | * Call with callback_sem held. |
520 | */ | 536 | */ |
521 | 537 | ||
522 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | 538 | static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) |
@@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) | |||
540 | * One way or another, we guarantee to return some non-empty subset | 556 | * One way or another, we guarantee to return some non-empty subset |
541 | * of node_online_map. | 557 | * of node_online_map. |
542 | * | 558 | * |
543 | * Call with cpuset_sem held. | 559 | * Call with callback_sem held. |
544 | */ | 560 | */ |
545 | 561 | ||
546 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | 562 | static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) |
@@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
555 | } | 571 | } |
556 | 572 | ||
557 | /* | 573 | /* |
558 | * Refresh current tasks mems_allowed and mems_generation from | 574 | * Refresh current tasks mems_allowed and mems_generation from current |
559 | * current tasks cpuset. Call with cpuset_sem held. | 575 | * tasks cpuset. |
576 | * | ||
577 | * Call without callback_sem or task_lock() held. May be called with | ||
578 | * or without manage_sem held. Will acquire task_lock() and might | ||
579 | * acquire callback_sem during call. | ||
580 | * | ||
581 | * The task_lock() is required to dereference current->cpuset safely. | ||
582 | * Without it, we could pick up the pointer value of current->cpuset | ||
583 | * in one instruction, and then attach_task could give us a different | ||
584 | * cpuset, and then the cpuset we had could be removed and freed, | ||
585 | * and then on our next instruction, we could dereference a no longer | ||
586 | * valid cpuset pointer to get its mems_generation field. | ||
560 | * | 587 | * |
561 | * This routine is needed to update the per-task mems_allowed | 588 | * This routine is needed to update the per-task mems_allowed data, |
562 | * data, within the tasks context, when it is trying to allocate | 589 | * within the tasks context, when it is trying to allocate memory |
563 | * memory (in various mm/mempolicy.c routines) and notices | 590 | * (in various mm/mempolicy.c routines) and notices that some other |
564 | * that some other task has been modifying its cpuset. | 591 | * task has been modifying its cpuset. |
565 | */ | 592 | */ |
566 | 593 | ||
567 | static void refresh_mems(void) | 594 | static void refresh_mems(void) |
568 | { | 595 | { |
569 | struct cpuset *cs = current->cpuset; | 596 | int my_cpusets_mem_gen; |
570 | 597 | ||
571 | if (current->cpuset_mems_generation != cs->mems_generation) { | 598 | task_lock(current); |
599 | my_cpusets_mem_gen = current->cpuset->mems_generation; | ||
600 | task_unlock(current); | ||
601 | |||
602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { | ||
603 | struct cpuset *cs; | ||
604 | nodemask_t oldmem = current->mems_allowed; | ||
605 | |||
606 | down(&callback_sem); | ||
607 | task_lock(current); | ||
608 | cs = current->cpuset; | ||
572 | guarantee_online_mems(cs, &current->mems_allowed); | 609 | guarantee_online_mems(cs, &current->mems_allowed); |
573 | current->cpuset_mems_generation = cs->mems_generation; | 610 | current->cpuset_mems_generation = cs->mems_generation; |
611 | task_unlock(current); | ||
612 | up(&callback_sem); | ||
613 | if (!nodes_equal(oldmem, current->mems_allowed)) | ||
614 | numa_policy_rebind(&oldmem, &current->mems_allowed); | ||
574 | } | 615 | } |
575 | } | 616 | } |
576 | 617 | ||
@@ -579,7 +620,7 @@ static void refresh_mems(void) | |||
579 | * | 620 | * |
580 | * One cpuset is a subset of another if all its allowed CPUs and | 621 | * One cpuset is a subset of another if all its allowed CPUs and |
581 | * Memory Nodes are a subset of the other, and its exclusive flags | 622 | * Memory Nodes are a subset of the other, and its exclusive flags |
582 | * are only set if the other's are set. | 623 | * are only set if the other's are set. Call holding manage_sem. |
583 | */ | 624 | */ |
584 | 625 | ||
585 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 626 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
597 | * If we replaced the flag and mask values of the current cpuset | 638 | * If we replaced the flag and mask values of the current cpuset |
598 | * (cur) with those values in the trial cpuset (trial), would | 639 | * (cur) with those values in the trial cpuset (trial), would |
599 | * our various subset and exclusive rules still be valid? Presumes | 640 | * our various subset and exclusive rules still be valid? Presumes |
600 | * cpuset_sem held. | 641 | * manage_sem held. |
601 | * | 642 | * |
602 | * 'cur' is the address of an actual, in-use cpuset. Operations | 643 | * 'cur' is the address of an actual, in-use cpuset. Operations |
603 | * such as list traversal that depend on the actual address of the | 644 | * such as list traversal that depend on the actual address of the |
@@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
651 | * exclusive child cpusets | 692 | * exclusive child cpusets |
652 | * Build these two partitions by calling partition_sched_domains | 693 | * Build these two partitions by calling partition_sched_domains |
653 | * | 694 | * |
654 | * Call with cpuset_sem held. May nest a call to the | 695 | * Call with manage_sem held. May nest a call to the |
655 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 696 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
656 | */ | 697 | */ |
657 | 698 | ||
@@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur) | |||
696 | unlock_cpu_hotplug(); | 737 | unlock_cpu_hotplug(); |
697 | } | 738 | } |
698 | 739 | ||
740 | /* | ||
741 | * Call with manage_sem held. May take callback_sem during call. | ||
742 | */ | ||
743 | |||
699 | static int update_cpumask(struct cpuset *cs, char *buf) | 744 | static int update_cpumask(struct cpuset *cs, char *buf) |
700 | { | 745 | { |
701 | struct cpuset trialcs; | 746 | struct cpuset trialcs; |
@@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
712 | if (retval < 0) | 757 | if (retval < 0) |
713 | return retval; | 758 | return retval; |
714 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | 759 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
760 | down(&callback_sem); | ||
715 | cs->cpus_allowed = trialcs.cpus_allowed; | 761 | cs->cpus_allowed = trialcs.cpus_allowed; |
762 | up(&callback_sem); | ||
716 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | 763 | if (is_cpu_exclusive(cs) && !cpus_unchanged) |
717 | update_cpu_domains(cs); | 764 | update_cpu_domains(cs); |
718 | return 0; | 765 | return 0; |
719 | } | 766 | } |
720 | 767 | ||
768 | /* | ||
769 | * Call with manage_sem held. May take callback_sem during call. | ||
770 | */ | ||
771 | |||
721 | static int update_nodemask(struct cpuset *cs, char *buf) | 772 | static int update_nodemask(struct cpuset *cs, char *buf) |
722 | { | 773 | { |
723 | struct cpuset trialcs; | 774 | struct cpuset trialcs; |
@@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
732 | return -ENOSPC; | 783 | return -ENOSPC; |
733 | retval = validate_change(cs, &trialcs); | 784 | retval = validate_change(cs, &trialcs); |
734 | if (retval == 0) { | 785 | if (retval == 0) { |
786 | down(&callback_sem); | ||
735 | cs->mems_allowed = trialcs.mems_allowed; | 787 | cs->mems_allowed = trialcs.mems_allowed; |
736 | atomic_inc(&cpuset_mems_generation); | 788 | atomic_inc(&cpuset_mems_generation); |
737 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 789 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
790 | up(&callback_sem); | ||
738 | } | 791 | } |
739 | return retval; | 792 | return retval; |
740 | } | 793 | } |
@@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
745 | * CS_NOTIFY_ON_RELEASE) | 798 | * CS_NOTIFY_ON_RELEASE) |
746 | * cs: the cpuset to update | 799 | * cs: the cpuset to update |
747 | * buf: the buffer where we read the 0 or 1 | 800 | * buf: the buffer where we read the 0 or 1 |
801 | * | ||
802 | * Call with manage_sem held. | ||
748 | */ | 803 | */ |
749 | 804 | ||
750 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 805 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
@@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
766 | return err; | 821 | return err; |
767 | cpu_exclusive_changed = | 822 | cpu_exclusive_changed = |
768 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 823 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
824 | down(&callback_sem); | ||
769 | if (turning_on) | 825 | if (turning_on) |
770 | set_bit(bit, &cs->flags); | 826 | set_bit(bit, &cs->flags); |
771 | else | 827 | else |
772 | clear_bit(bit, &cs->flags); | 828 | clear_bit(bit, &cs->flags); |
829 | up(&callback_sem); | ||
773 | 830 | ||
774 | if (cpu_exclusive_changed) | 831 | if (cpu_exclusive_changed) |
775 | update_cpu_domains(cs); | 832 | update_cpu_domains(cs); |
776 | return 0; | 833 | return 0; |
777 | } | 834 | } |
778 | 835 | ||
836 | /* | ||
837 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | ||
838 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | ||
839 | * notified on release. | ||
840 | * | ||
841 | * Call holding manage_sem. May take callback_sem and task_lock of | ||
842 | * the task 'pid' during call. | ||
843 | */ | ||
844 | |||
779 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | 845 | static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) |
780 | { | 846 | { |
781 | pid_t pid; | 847 | pid_t pid; |
@@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
792 | read_lock(&tasklist_lock); | 858 | read_lock(&tasklist_lock); |
793 | 859 | ||
794 | tsk = find_task_by_pid(pid); | 860 | tsk = find_task_by_pid(pid); |
795 | if (!tsk) { | 861 | if (!tsk || tsk->flags & PF_EXITING) { |
796 | read_unlock(&tasklist_lock); | 862 | read_unlock(&tasklist_lock); |
797 | return -ESRCH; | 863 | return -ESRCH; |
798 | } | 864 | } |
@@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
810 | get_task_struct(tsk); | 876 | get_task_struct(tsk); |
811 | } | 877 | } |
812 | 878 | ||
879 | down(&callback_sem); | ||
880 | |||
813 | task_lock(tsk); | 881 | task_lock(tsk); |
814 | oldcs = tsk->cpuset; | 882 | oldcs = tsk->cpuset; |
815 | if (!oldcs) { | 883 | if (!oldcs) { |
816 | task_unlock(tsk); | 884 | task_unlock(tsk); |
885 | up(&callback_sem); | ||
817 | put_task_struct(tsk); | 886 | put_task_struct(tsk); |
818 | return -ESRCH; | 887 | return -ESRCH; |
819 | } | 888 | } |
@@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
824 | guarantee_online_cpus(cs, &cpus); | 893 | guarantee_online_cpus(cs, &cpus); |
825 | set_cpus_allowed(tsk, cpus); | 894 | set_cpus_allowed(tsk, cpus); |
826 | 895 | ||
896 | up(&callback_sem); | ||
827 | put_task_struct(tsk); | 897 | put_task_struct(tsk); |
828 | if (atomic_dec_and_test(&oldcs->count)) | 898 | if (atomic_dec_and_test(&oldcs->count)) |
829 | check_for_release(oldcs, ppathbuf); | 899 | check_for_release(oldcs, ppathbuf); |
@@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
867 | } | 937 | } |
868 | buffer[nbytes] = 0; /* nul-terminate */ | 938 | buffer[nbytes] = 0; /* nul-terminate */ |
869 | 939 | ||
870 | cpuset_down(&cpuset_sem); | 940 | down(&manage_sem); |
871 | 941 | ||
872 | if (is_removed(cs)) { | 942 | if (is_removed(cs)) { |
873 | retval = -ENODEV; | 943 | retval = -ENODEV; |
@@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
901 | if (retval == 0) | 971 | if (retval == 0) |
902 | retval = nbytes; | 972 | retval = nbytes; |
903 | out2: | 973 | out2: |
904 | cpuset_up(&cpuset_sem); | 974 | up(&manage_sem); |
905 | cpuset_release_agent(pathbuf); | 975 | cpuset_release_agent(pathbuf); |
906 | out1: | 976 | out1: |
907 | kfree(buffer); | 977 | kfree(buffer); |
@@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
941 | { | 1011 | { |
942 | cpumask_t mask; | 1012 | cpumask_t mask; |
943 | 1013 | ||
944 | cpuset_down(&cpuset_sem); | 1014 | down(&callback_sem); |
945 | mask = cs->cpus_allowed; | 1015 | mask = cs->cpus_allowed; |
946 | cpuset_up(&cpuset_sem); | 1016 | up(&callback_sem); |
947 | 1017 | ||
948 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 1018 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
949 | } | 1019 | } |
@@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
952 | { | 1022 | { |
953 | nodemask_t mask; | 1023 | nodemask_t mask; |
954 | 1024 | ||
955 | cpuset_down(&cpuset_sem); | 1025 | down(&callback_sem); |
956 | mask = cs->mems_allowed; | 1026 | mask = cs->mems_allowed; |
957 | cpuset_up(&cpuset_sem); | 1027 | up(&callback_sem); |
958 | 1028 | ||
959 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 1029 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
960 | } | 1030 | } |
@@ -968,8 +1038,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
968 | char *page; | 1038 | char *page; |
969 | ssize_t retval = 0; | 1039 | ssize_t retval = 0; |
970 | char *s; | 1040 | char *s; |
971 | char *start; | ||
972 | size_t n; | ||
973 | 1041 | ||
974 | if (!(page = (char *)__get_free_page(GFP_KERNEL))) | 1042 | if (!(page = (char *)__get_free_page(GFP_KERNEL))) |
975 | return -ENOMEM; | 1043 | return -ENOMEM; |
@@ -997,16 +1065,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
997 | goto out; | 1065 | goto out; |
998 | } | 1066 | } |
999 | *s++ = '\n'; | 1067 | *s++ = '\n'; |
1000 | *s = '\0'; | ||
1001 | 1068 | ||
1002 | /* Do nothing if *ppos is at the eof or beyond the eof. */ | 1069 | retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); |
1003 | if (s - page <= *ppos) | ||
1004 | return 0; | ||
1005 | |||
1006 | start = page + *ppos; | ||
1007 | n = s - start; | ||
1008 | retval = n - copy_to_user(buf, start, min(n, nbytes)); | ||
1009 | *ppos += retval; | ||
1010 | out: | 1070 | out: |
1011 | free_page((unsigned long)page); | 1071 | free_page((unsigned long)page); |
1012 | return retval; | 1072 | return retval; |
@@ -1057,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file) | |||
1057 | return 0; | 1117 | return 0; |
1058 | } | 1118 | } |
1059 | 1119 | ||
1120 | /* | ||
1121 | * cpuset_rename - Only allow simple rename of directories in place. | ||
1122 | */ | ||
1123 | static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, | ||
1124 | struct inode *new_dir, struct dentry *new_dentry) | ||
1125 | { | ||
1126 | if (!S_ISDIR(old_dentry->d_inode->i_mode)) | ||
1127 | return -ENOTDIR; | ||
1128 | if (new_dentry->d_inode) | ||
1129 | return -EEXIST; | ||
1130 | if (old_dir != new_dir) | ||
1131 | return -EIO; | ||
1132 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | ||
1133 | } | ||
1134 | |||
1060 | static struct file_operations cpuset_file_operations = { | 1135 | static struct file_operations cpuset_file_operations = { |
1061 | .read = cpuset_file_read, | 1136 | .read = cpuset_file_read, |
1062 | .write = cpuset_file_write, | 1137 | .write = cpuset_file_write, |
@@ -1069,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = { | |||
1069 | .lookup = simple_lookup, | 1144 | .lookup = simple_lookup, |
1070 | .mkdir = cpuset_mkdir, | 1145 | .mkdir = cpuset_mkdir, |
1071 | .rmdir = cpuset_rmdir, | 1146 | .rmdir = cpuset_rmdir, |
1147 | .rename = cpuset_rename, | ||
1072 | }; | 1148 | }; |
1073 | 1149 | ||
1074 | static int cpuset_create_file(struct dentry *dentry, int mode) | 1150 | static int cpuset_create_file(struct dentry *dentry, int mode) |
@@ -1172,7 +1248,9 @@ struct ctr_struct { | |||
1172 | 1248 | ||
1173 | /* | 1249 | /* |
1174 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. | 1250 | * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. |
1175 | * Return actual number of pids loaded. | 1251 | * Return actual number of pids loaded. No need to task_lock(p) |
1252 | * when reading out p->cpuset, as we don't really care if it changes | ||
1253 | * on the next cycle, and we are not going to try to dereference it. | ||
1176 | */ | 1254 | */ |
1177 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) | 1255 | static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) |
1178 | { | 1256 | { |
@@ -1214,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
1214 | return cnt; | 1292 | return cnt; |
1215 | } | 1293 | } |
1216 | 1294 | ||
1295 | /* | ||
1296 | * Handle an open on 'tasks' file. Prepare a buffer listing the | ||
1297 | * process id's of tasks currently attached to the cpuset being opened. | ||
1298 | * | ||
1299 | * Does not require any specific cpuset semaphores, and does not take any. | ||
1300 | */ | ||
1217 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1301 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
1218 | { | 1302 | { |
1219 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1303 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); |
@@ -1361,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1361 | if (!cs) | 1445 | if (!cs) |
1362 | return -ENOMEM; | 1446 | return -ENOMEM; |
1363 | 1447 | ||
1364 | cpuset_down(&cpuset_sem); | 1448 | down(&manage_sem); |
1449 | refresh_mems(); | ||
1365 | cs->flags = 0; | 1450 | cs->flags = 0; |
1366 | if (notify_on_release(parent)) | 1451 | if (notify_on_release(parent)) |
1367 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1452 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
@@ -1375,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1375 | 1460 | ||
1376 | cs->parent = parent; | 1461 | cs->parent = parent; |
1377 | 1462 | ||
1463 | down(&callback_sem); | ||
1378 | list_add(&cs->sibling, &cs->parent->children); | 1464 | list_add(&cs->sibling, &cs->parent->children); |
1465 | up(&callback_sem); | ||
1379 | 1466 | ||
1380 | err = cpuset_create_dir(cs, name, mode); | 1467 | err = cpuset_create_dir(cs, name, mode); |
1381 | if (err < 0) | 1468 | if (err < 0) |
1382 | goto err; | 1469 | goto err; |
1383 | 1470 | ||
1384 | /* | 1471 | /* |
1385 | * Release cpuset_sem before cpuset_populate_dir() because it | 1472 | * Release manage_sem before cpuset_populate_dir() because it |
1386 | * will down() this new directory's i_sem and if we race with | 1473 | * will down() this new directory's i_sem and if we race with |
1387 | * another mkdir, we might deadlock. | 1474 | * another mkdir, we might deadlock. |
1388 | */ | 1475 | */ |
1389 | cpuset_up(&cpuset_sem); | 1476 | up(&manage_sem); |
1390 | 1477 | ||
1391 | err = cpuset_populate_dir(cs->dentry); | 1478 | err = cpuset_populate_dir(cs->dentry); |
1392 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1479 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
1393 | return 0; | 1480 | return 0; |
1394 | err: | 1481 | err: |
1395 | list_del(&cs->sibling); | 1482 | list_del(&cs->sibling); |
1396 | cpuset_up(&cpuset_sem); | 1483 | up(&manage_sem); |
1397 | kfree(cs); | 1484 | kfree(cs); |
1398 | return err; | 1485 | return err; |
1399 | } | 1486 | } |
@@ -1415,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1415 | 1502 | ||
1416 | /* the vfs holds both inode->i_sem already */ | 1503 | /* the vfs holds both inode->i_sem already */ |
1417 | 1504 | ||
1418 | cpuset_down(&cpuset_sem); | 1505 | down(&manage_sem); |
1506 | refresh_mems(); | ||
1419 | if (atomic_read(&cs->count) > 0) { | 1507 | if (atomic_read(&cs->count) > 0) { |
1420 | cpuset_up(&cpuset_sem); | 1508 | up(&manage_sem); |
1421 | return -EBUSY; | 1509 | return -EBUSY; |
1422 | } | 1510 | } |
1423 | if (!list_empty(&cs->children)) { | 1511 | if (!list_empty(&cs->children)) { |
1424 | cpuset_up(&cpuset_sem); | 1512 | up(&manage_sem); |
1425 | return -EBUSY; | 1513 | return -EBUSY; |
1426 | } | 1514 | } |
1427 | parent = cs->parent; | 1515 | parent = cs->parent; |
1516 | down(&callback_sem); | ||
1428 | set_bit(CS_REMOVED, &cs->flags); | 1517 | set_bit(CS_REMOVED, &cs->flags); |
1429 | if (is_cpu_exclusive(cs)) | 1518 | if (is_cpu_exclusive(cs)) |
1430 | update_cpu_domains(cs); | 1519 | update_cpu_domains(cs); |
1431 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1520 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
1432 | if (list_empty(&parent->children)) | ||
1433 | check_for_release(parent, &pathbuf); | ||
1434 | spin_lock(&cs->dentry->d_lock); | 1521 | spin_lock(&cs->dentry->d_lock); |
1435 | d = dget(cs->dentry); | 1522 | d = dget(cs->dentry); |
1436 | cs->dentry = NULL; | 1523 | cs->dentry = NULL; |
1437 | spin_unlock(&d->d_lock); | 1524 | spin_unlock(&d->d_lock); |
1438 | cpuset_d_remove_dir(d); | 1525 | cpuset_d_remove_dir(d); |
1439 | dput(d); | 1526 | dput(d); |
1440 | cpuset_up(&cpuset_sem); | 1527 | up(&callback_sem); |
1528 | if (list_empty(&parent->children)) | ||
1529 | check_for_release(parent, &pathbuf); | ||
1530 | up(&manage_sem); | ||
1441 | cpuset_release_agent(pathbuf); | 1531 | cpuset_release_agent(pathbuf); |
1442 | return 0; | 1532 | return 0; |
1443 | } | 1533 | } |
@@ -1497,16 +1587,26 @@ void __init cpuset_init_smp(void) | |||
1497 | * cpuset_fork - attach newly forked task to its parents cpuset. | 1587 | * cpuset_fork - attach newly forked task to its parents cpuset. |
1498 | * @tsk: pointer to task_struct of forking parent process. | 1588 | * @tsk: pointer to task_struct of forking parent process. |
1499 | * | 1589 | * |
1500 | * Description: By default, on fork, a task inherits its | 1590 | * Description: A task inherits its parent's cpuset at fork(). |
1501 | * parent's cpuset. The pointer to the shared cpuset is | 1591 | * |
1502 | * automatically copied in fork.c by dup_task_struct(). | 1592 | * A pointer to the shared cpuset was automatically copied in fork.c |
1503 | * This cpuset_fork() routine need only increment the usage | 1593 | * by dup_task_struct(). However, we ignore that copy, since it was |
1504 | * counter in that cpuset. | 1594 | * not made under the protection of task_lock(), so might no longer be |
1595 | * a valid cpuset pointer. attach_task() might have already changed | ||
1596 | * current->cpuset, allowing the previously referenced cpuset to | ||
1597 | * be removed and freed. Instead, we task_lock(current) and copy | ||
1598 | * its present value of current->cpuset for our freshly forked child. | ||
1599 | * | ||
1600 | * At the point that cpuset_fork() is called, 'current' is the parent | ||
1601 | * task, and the passed argument 'child' points to the child task. | ||
1505 | **/ | 1602 | **/ |
1506 | 1603 | ||
1507 | void cpuset_fork(struct task_struct *tsk) | 1604 | void cpuset_fork(struct task_struct *child) |
1508 | { | 1605 | { |
1509 | atomic_inc(&tsk->cpuset->count); | 1606 | task_lock(current); |
1607 | child->cpuset = current->cpuset; | ||
1608 | atomic_inc(&child->cpuset->count); | ||
1609 | task_unlock(current); | ||
1510 | } | 1610 | } |
1511 | 1611 | ||
1512 | /** | 1612 | /** |
@@ -1515,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk) | |||
1515 | * | 1615 | * |
1516 | * Description: Detach cpuset from @tsk and release it. | 1616 | * Description: Detach cpuset from @tsk and release it. |
1517 | * | 1617 | * |
1518 | * Note that cpusets marked notify_on_release force every task | 1618 | * Note that cpusets marked notify_on_release force every task in |
1519 | * in them to take the global cpuset_sem semaphore when exiting. | 1619 | * them to take the global manage_sem semaphore when exiting. |
1520 | * This could impact scaling on very large systems. Be reluctant | 1620 | * This could impact scaling on very large systems. Be reluctant to |
1521 | * to use notify_on_release cpusets where very high task exit | 1621 | * use notify_on_release cpusets where very high task exit scaling |
1522 | * scaling is required on large systems. | 1622 | * is required on large systems. |
1523 | * | 1623 | * |
1524 | * Don't even think about dereferencing 'cs' after the cpuset use | 1624 | * Don't even think about dereferencing 'cs' after the cpuset use count |
1525 | * count goes to zero, except inside a critical section guarded | 1625 | * goes to zero, except inside a critical section guarded by manage_sem |
1526 | * by the cpuset_sem semaphore. If you don't hold cpuset_sem, | 1626 | * or callback_sem. Otherwise a zero cpuset use count is a license to |
1527 | * then a zero cpuset use count is a license to any other task to | 1627 | * any other task to nuke the cpuset immediately, via cpuset_rmdir(). |
1528 | * nuke the cpuset immediately. | 1628 | * |
1629 | * This routine has to take manage_sem, not callback_sem, because | ||
1630 | * it is holding that semaphore while calling check_for_release(), | ||
1631 | * which calls kmalloc(), so can't be called holding callback_sem. | ||
1632 | * | ||
1633 | * We don't need to task_lock() this reference to tsk->cpuset, | ||
1634 | * because tsk is already marked PF_EXITING, so attach_task() won't | ||
1635 | * mess with it. | ||
1529 | **/ | 1636 | **/ |
1530 | 1637 | ||
1531 | void cpuset_exit(struct task_struct *tsk) | 1638 | void cpuset_exit(struct task_struct *tsk) |
1532 | { | 1639 | { |
1533 | struct cpuset *cs; | 1640 | struct cpuset *cs; |
1534 | 1641 | ||
1535 | task_lock(tsk); | 1642 | BUG_ON(!(tsk->flags & PF_EXITING)); |
1643 | |||
1536 | cs = tsk->cpuset; | 1644 | cs = tsk->cpuset; |
1537 | tsk->cpuset = NULL; | 1645 | tsk->cpuset = NULL; |
1538 | task_unlock(tsk); | ||
1539 | 1646 | ||
1540 | if (notify_on_release(cs)) { | 1647 | if (notify_on_release(cs)) { |
1541 | char *pathbuf = NULL; | 1648 | char *pathbuf = NULL; |
1542 | 1649 | ||
1543 | cpuset_down(&cpuset_sem); | 1650 | down(&manage_sem); |
1544 | if (atomic_dec_and_test(&cs->count)) | 1651 | if (atomic_dec_and_test(&cs->count)) |
1545 | check_for_release(cs, &pathbuf); | 1652 | check_for_release(cs, &pathbuf); |
1546 | cpuset_up(&cpuset_sem); | 1653 | up(&manage_sem); |
1547 | cpuset_release_agent(pathbuf); | 1654 | cpuset_release_agent(pathbuf); |
1548 | } else { | 1655 | } else { |
1549 | atomic_dec(&cs->count); | 1656 | atomic_dec(&cs->count); |
@@ -1564,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
1564 | { | 1671 | { |
1565 | cpumask_t mask; | 1672 | cpumask_t mask; |
1566 | 1673 | ||
1567 | cpuset_down(&cpuset_sem); | 1674 | down(&callback_sem); |
1568 | task_lock((struct task_struct *)tsk); | 1675 | task_lock((struct task_struct *)tsk); |
1569 | guarantee_online_cpus(tsk->cpuset, &mask); | 1676 | guarantee_online_cpus(tsk->cpuset, &mask); |
1570 | task_unlock((struct task_struct *)tsk); | 1677 | task_unlock((struct task_struct *)tsk); |
1571 | cpuset_up(&cpuset_sem); | 1678 | up(&callback_sem); |
1572 | 1679 | ||
1573 | return mask; | 1680 | return mask; |
1574 | } | 1681 | } |
@@ -1584,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void) | |||
1584 | * If the current tasks cpusets mems_allowed changed behind our backs, | 1691 | * If the current tasks cpusets mems_allowed changed behind our backs, |
1585 | * update current->mems_allowed and mems_generation to the new value. | 1692 | * update current->mems_allowed and mems_generation to the new value. |
1586 | * Do not call this routine if in_interrupt(). | 1693 | * Do not call this routine if in_interrupt(). |
1694 | * | ||
1695 | * Call without callback_sem or task_lock() held. May be called | ||
1696 | * with or without manage_sem held. Unless exiting, it will acquire | ||
1697 | * task_lock(). Also might acquire callback_sem during call to | ||
1698 | * refresh_mems(). | ||
1587 | */ | 1699 | */ |
1588 | 1700 | ||
1589 | void cpuset_update_current_mems_allowed(void) | 1701 | void cpuset_update_current_mems_allowed(void) |
1590 | { | 1702 | { |
1591 | struct cpuset *cs = current->cpuset; | 1703 | struct cpuset *cs; |
1704 | int need_to_refresh = 0; | ||
1592 | 1705 | ||
1706 | task_lock(current); | ||
1707 | cs = current->cpuset; | ||
1593 | if (!cs) | 1708 | if (!cs) |
1594 | return; /* task is exiting */ | 1709 | goto done; |
1595 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1710 | if (current->cpuset_mems_generation != cs->mems_generation) |
1596 | cpuset_down(&cpuset_sem); | 1711 | need_to_refresh = 1; |
1712 | done: | ||
1713 | task_unlock(current); | ||
1714 | if (need_to_refresh) | ||
1597 | refresh_mems(); | 1715 | refresh_mems(); |
1598 | cpuset_up(&cpuset_sem); | ||
1599 | } | ||
1600 | } | 1716 | } |
1601 | 1717 | ||
1602 | /** | 1718 | /** |
@@ -1630,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
1630 | 1746 | ||
1631 | /* | 1747 | /* |
1632 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | 1748 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive |
1633 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | 1749 | * ancestor to the specified cpuset. Call holding callback_sem. |
1634 | * If no ancestor is mem_exclusive (an unusual configuration), then | 1750 | * If no ancestor is mem_exclusive (an unusual configuration), then |
1635 | * returns the root cpuset. | 1751 | * returns the root cpuset. |
1636 | */ | 1752 | */ |
@@ -1657,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1657 | * GFP_KERNEL allocations are not so marked, so can escape to the | 1773 | * GFP_KERNEL allocations are not so marked, so can escape to the |
1658 | * nearest mem_exclusive ancestor cpuset. | 1774 | * nearest mem_exclusive ancestor cpuset. |
1659 | * | 1775 | * |
1660 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | 1776 | * Scanning up parent cpusets requires callback_sem. The __alloc_pages() |
1661 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 1777 | * routine only calls here with __GFP_HARDWALL bit _not_ set if |
1662 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 1778 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks |
1663 | * mems_allowed came up empty on the first pass over the zonelist. | 1779 | * mems_allowed came up empty on the first pass over the zonelist. |
1664 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 1780 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are |
1665 | * short of memory, might require taking the cpuset_sem semaphore. | 1781 | * short of memory, might require taking the callback_sem semaphore. |
1666 | * | 1782 | * |
1667 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | 1783 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() |
1668 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | 1784 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing |
@@ -1679,7 +1795,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1679 | * GFP_USER - only nodes in current tasks mems allowed ok. | 1795 | * GFP_USER - only nodes in current tasks mems allowed ok. |
1680 | **/ | 1796 | **/ |
1681 | 1797 | ||
1682 | int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) | 1798 | int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) |
1683 | { | 1799 | { |
1684 | int node; /* node that zone z is on */ | 1800 | int node; /* node that zone z is on */ |
1685 | const struct cpuset *cs; /* current cpuset ancestors */ | 1801 | const struct cpuset *cs; /* current cpuset ancestors */ |
@@ -1693,15 +1809,18 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) | |||
1693 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ | 1809 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ |
1694 | return 0; | 1810 | return 0; |
1695 | 1811 | ||
1812 | if (current->flags & PF_EXITING) /* Let dying task have memory */ | ||
1813 | return 1; | ||
1814 | |||
1696 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 1815 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
1697 | cpuset_down(&cpuset_sem); | 1816 | down(&callback_sem); |
1698 | cs = current->cpuset; | 1817 | |
1699 | if (!cs) | 1818 | task_lock(current); |
1700 | goto done; /* current task exiting */ | 1819 | cs = nearest_exclusive_ancestor(current->cpuset); |
1701 | cs = nearest_exclusive_ancestor(cs); | 1820 | task_unlock(current); |
1821 | |||
1702 | allowed = node_isset(node, cs->mems_allowed); | 1822 | allowed = node_isset(node, cs->mems_allowed); |
1703 | done: | 1823 | up(&callback_sem); |
1704 | cpuset_up(&cpuset_sem); | ||
1705 | return allowed; | 1824 | return allowed; |
1706 | } | 1825 | } |
1707 | 1826 | ||
@@ -1714,7 +1833,7 @@ done: | |||
1714 | * determine if task @p's memory usage might impact the memory | 1833 | * determine if task @p's memory usage might impact the memory |
1715 | * available to the current task. | 1834 | * available to the current task. |
1716 | * | 1835 | * |
1717 | * Acquires cpuset_sem - not suitable for calling from a fast path. | 1836 | * Acquires callback_sem - not suitable for calling from a fast path. |
1718 | **/ | 1837 | **/ |
1719 | 1838 | ||
1720 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | 1839 | int cpuset_excl_nodes_overlap(const struct task_struct *p) |
@@ -1722,18 +1841,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1722 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 1841 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
1723 | int overlap = 0; /* do cpusets overlap? */ | 1842 | int overlap = 0; /* do cpusets overlap? */ |
1724 | 1843 | ||
1725 | cpuset_down(&cpuset_sem); | 1844 | down(&callback_sem); |
1726 | cs1 = current->cpuset; | 1845 | |
1727 | if (!cs1) | 1846 | task_lock(current); |
1728 | goto done; /* current task exiting */ | 1847 | if (current->flags & PF_EXITING) { |
1729 | cs2 = p->cpuset; | 1848 | task_unlock(current); |
1730 | if (!cs2) | 1849 | goto done; |
1731 | goto done; /* task p is exiting */ | 1850 | } |
1732 | cs1 = nearest_exclusive_ancestor(cs1); | 1851 | cs1 = nearest_exclusive_ancestor(current->cpuset); |
1733 | cs2 = nearest_exclusive_ancestor(cs2); | 1852 | task_unlock(current); |
1853 | |||
1854 | task_lock((struct task_struct *)p); | ||
1855 | if (p->flags & PF_EXITING) { | ||
1856 | task_unlock((struct task_struct *)p); | ||
1857 | goto done; | ||
1858 | } | ||
1859 | cs2 = nearest_exclusive_ancestor(p->cpuset); | ||
1860 | task_unlock((struct task_struct *)p); | ||
1861 | |||
1734 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 1862 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
1735 | done: | 1863 | done: |
1736 | cpuset_up(&cpuset_sem); | 1864 | up(&callback_sem); |
1737 | 1865 | ||
1738 | return overlap; | 1866 | return overlap; |
1739 | } | 1867 | } |
@@ -1742,6 +1870,10 @@ done: | |||
1742 | * proc_cpuset_show() | 1870 | * proc_cpuset_show() |
1743 | * - Print tasks cpuset path into seq_file. | 1871 | * - Print tasks cpuset path into seq_file. |
1744 | * - Used for /proc/<pid>/cpuset. | 1872 | * - Used for /proc/<pid>/cpuset. |
1873 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | ||
1874 | * doesn't really matter if tsk->cpuset changes after we read it, | ||
1875 | * and we take manage_sem, keeping attach_task() from changing it | ||
1876 | * anyway. | ||
1745 | */ | 1877 | */ |
1746 | 1878 | ||
1747 | static int proc_cpuset_show(struct seq_file *m, void *v) | 1879 | static int proc_cpuset_show(struct seq_file *m, void *v) |
@@ -1756,10 +1888,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1756 | return -ENOMEM; | 1888 | return -ENOMEM; |
1757 | 1889 | ||
1758 | tsk = m->private; | 1890 | tsk = m->private; |
1759 | cpuset_down(&cpuset_sem); | 1891 | down(&manage_sem); |
1760 | task_lock(tsk); | ||
1761 | cs = tsk->cpuset; | 1892 | cs = tsk->cpuset; |
1762 | task_unlock(tsk); | ||
1763 | if (!cs) { | 1893 | if (!cs) { |
1764 | retval = -EINVAL; | 1894 | retval = -EINVAL; |
1765 | goto out; | 1895 | goto out; |
@@ -1771,7 +1901,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1771 | seq_puts(m, buf); | 1901 | seq_puts(m, buf); |
1772 | seq_putc(m, '\n'); | 1902 | seq_putc(m, '\n'); |
1773 | out: | 1903 | out: |
1774 | cpuset_up(&cpuset_sem); | 1904 | up(&manage_sem); |
1775 | kfree(buf); | 1905 | kfree(buf); |
1776 | return retval; | 1906 | return retval; |
1777 | } | 1907 | } |
diff --git a/kernel/exit.c b/kernel/exit.c index ee6d8b8abef5..ee515683b92d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/cpuset.h> | 28 | #include <linux/cpuset.h> |
29 | #include <linux/syscalls.h> | 29 | #include <linux/syscalls.h> |
30 | #include <linux/signal.h> | 30 | #include <linux/signal.h> |
31 | #include <linux/cn_proc.h> | ||
31 | 32 | ||
32 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
33 | #include <asm/unistd.h> | 34 | #include <asm/unistd.h> |
@@ -547,7 +548,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
547 | 548 | ||
548 | if (p->pdeath_signal) | 549 | if (p->pdeath_signal) |
549 | /* We already hold the tasklist_lock here. */ | 550 | /* We already hold the tasklist_lock here. */ |
550 | group_send_sig_info(p->pdeath_signal, (void *) 0, p); | 551 | group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); |
551 | 552 | ||
552 | /* Move the child from its dying parent to the new one. */ | 553 | /* Move the child from its dying parent to the new one. */ |
553 | if (unlikely(traced)) { | 554 | if (unlikely(traced)) { |
@@ -591,8 +592,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced) | |||
591 | int pgrp = process_group(p); | 592 | int pgrp = process_group(p); |
592 | 593 | ||
593 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { | 594 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { |
594 | __kill_pg_info(SIGHUP, (void *)1, pgrp); | 595 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
595 | __kill_pg_info(SIGCONT, (void *)1, pgrp); | 596 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
596 | } | 597 | } |
597 | } | 598 | } |
598 | } | 599 | } |
@@ -727,8 +728,8 @@ static void exit_notify(struct task_struct *tsk) | |||
727 | (t->signal->session == tsk->signal->session) && | 728 | (t->signal->session == tsk->signal->session) && |
728 | will_become_orphaned_pgrp(process_group(tsk), tsk) && | 729 | will_become_orphaned_pgrp(process_group(tsk), tsk) && |
729 | has_stopped_jobs(process_group(tsk))) { | 730 | has_stopped_jobs(process_group(tsk))) { |
730 | __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); | 731 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); |
731 | __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); | 732 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); |
732 | } | 733 | } |
733 | 734 | ||
734 | /* Let father know we died | 735 | /* Let father know we died |
@@ -783,10 +784,6 @@ static void exit_notify(struct task_struct *tsk) | |||
783 | /* If the process is dead, release it - nobody will wait for it */ | 784 | /* If the process is dead, release it - nobody will wait for it */ |
784 | if (state == EXIT_DEAD) | 785 | if (state == EXIT_DEAD) |
785 | release_task(tsk); | 786 | release_task(tsk); |
786 | |||
787 | /* PF_DEAD causes final put_task_struct after we schedule. */ | ||
788 | preempt_disable(); | ||
789 | tsk->flags |= PF_DEAD; | ||
790 | } | 787 | } |
791 | 788 | ||
792 | fastcall NORET_TYPE void do_exit(long code) | 789 | fastcall NORET_TYPE void do_exit(long code) |
@@ -839,10 +836,14 @@ fastcall NORET_TYPE void do_exit(long code) | |||
839 | preempt_count()); | 836 | preempt_count()); |
840 | 837 | ||
841 | acct_update_integrals(tsk); | 838 | acct_update_integrals(tsk); |
842 | update_mem_hiwater(tsk); | 839 | if (tsk->mm) { |
840 | update_hiwater_rss(tsk->mm); | ||
841 | update_hiwater_vm(tsk->mm); | ||
842 | } | ||
843 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 843 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
844 | if (group_dead) { | 844 | if (group_dead) { |
845 | del_timer_sync(&tsk->signal->real_timer); | 845 | del_timer_sync(&tsk->signal->real_timer); |
846 | exit_itimers(tsk->signal); | ||
846 | acct_process(code); | 847 | acct_process(code); |
847 | } | 848 | } |
848 | exit_mm(tsk); | 849 | exit_mm(tsk); |
@@ -858,18 +859,23 @@ fastcall NORET_TYPE void do_exit(long code) | |||
858 | if (group_dead && tsk->signal->leader) | 859 | if (group_dead && tsk->signal->leader) |
859 | disassociate_ctty(1); | 860 | disassociate_ctty(1); |
860 | 861 | ||
861 | module_put(tsk->thread_info->exec_domain->module); | 862 | module_put(task_thread_info(tsk)->exec_domain->module); |
862 | if (tsk->binfmt) | 863 | if (tsk->binfmt) |
863 | module_put(tsk->binfmt->module); | 864 | module_put(tsk->binfmt->module); |
864 | 865 | ||
865 | tsk->exit_code = code; | 866 | tsk->exit_code = code; |
867 | proc_exit_connector(tsk); | ||
866 | exit_notify(tsk); | 868 | exit_notify(tsk); |
867 | #ifdef CONFIG_NUMA | 869 | #ifdef CONFIG_NUMA |
868 | mpol_free(tsk->mempolicy); | 870 | mpol_free(tsk->mempolicy); |
869 | tsk->mempolicy = NULL; | 871 | tsk->mempolicy = NULL; |
870 | #endif | 872 | #endif |
871 | 873 | ||
872 | BUG_ON(!(current->flags & PF_DEAD)); | 874 | /* PF_DEAD causes final put_task_struct after we schedule. */ |
875 | preempt_disable(); | ||
876 | BUG_ON(tsk->flags & PF_DEAD); | ||
877 | tsk->flags |= PF_DEAD; | ||
878 | |||
873 | schedule(); | 879 | schedule(); |
874 | BUG(); | 880 | BUG(); |
875 | /* Avoid "noreturn function does return". */ | 881 | /* Avoid "noreturn function does return". */ |
@@ -1203,7 +1209,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, | |||
1203 | 1209 | ||
1204 | exit_code = p->exit_code; | 1210 | exit_code = p->exit_code; |
1205 | if (unlikely(!exit_code) || | 1211 | if (unlikely(!exit_code) || |
1206 | unlikely(p->state > TASK_STOPPED)) | 1212 | unlikely(p->state & TASK_TRACED)) |
1207 | goto bail_ref; | 1213 | goto bail_ref; |
1208 | return wait_noreap_copyout(p, pid, uid, | 1214 | return wait_noreap_copyout(p, pid, uid, |
1209 | why, (exit_code << 8) | 0x7f, | 1215 | why, (exit_code << 8) | 0x7f, |
@@ -1379,6 +1385,15 @@ repeat: | |||
1379 | 1385 | ||
1380 | switch (p->state) { | 1386 | switch (p->state) { |
1381 | case TASK_TRACED: | 1387 | case TASK_TRACED: |
1388 | /* | ||
1389 | * When we hit the race with PTRACE_ATTACH, | ||
1390 | * we will not report this child. But the | ||
1391 | * race means it has not yet been moved to | ||
1392 | * our ptrace_children list, so we need to | ||
1393 | * set the flag here to avoid a spurious ECHILD | ||
1394 | * when the race happens with the only child. | ||
1395 | */ | ||
1396 | flag = 1; | ||
1382 | if (!my_ptrace_child(p)) | 1397 | if (!my_ptrace_child(p)) |
1383 | continue; | 1398 | continue; |
1384 | /*FALLTHROUGH*/ | 1399 | /*FALLTHROUGH*/ |
diff --git a/kernel/fork.c b/kernel/fork.c index 533ce27f4b2c..fb8572a42297 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include <linux/profile.h> | 42 | #include <linux/profile.h> |
43 | #include <linux/rmap.h> | 43 | #include <linux/rmap.h> |
44 | #include <linux/acct.h> | 44 | #include <linux/acct.h> |
45 | #include <linux/cn_proc.h> | ||
45 | 46 | ||
46 | #include <asm/pgtable.h> | 47 | #include <asm/pgtable.h> |
47 | #include <asm/pgalloc.h> | 48 | #include <asm/pgalloc.h> |
@@ -170,10 +171,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
170 | return NULL; | 171 | return NULL; |
171 | } | 172 | } |
172 | 173 | ||
173 | *ti = *orig->thread_info; | ||
174 | *tsk = *orig; | 174 | *tsk = *orig; |
175 | tsk->thread_info = ti; | 175 | tsk->thread_info = ti; |
176 | ti->task = tsk; | 176 | setup_thread_stack(tsk, orig); |
177 | 177 | ||
178 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | 178 | /* One for us, one for whoever does the "release_task()" (usually parent) */ |
179 | atomic_set(&tsk->usage,2); | 179 | atomic_set(&tsk->usage,2); |
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
182 | } | 182 | } |
183 | 183 | ||
184 | #ifdef CONFIG_MMU | 184 | #ifdef CONFIG_MMU |
185 | static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | 185 | static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) |
186 | { | 186 | { |
187 | struct vm_area_struct * mpnt, *tmp, **pprev; | 187 | struct vm_area_struct *mpnt, *tmp, **pprev; |
188 | struct rb_node **rb_link, *rb_parent; | 188 | struct rb_node **rb_link, *rb_parent; |
189 | int retval; | 189 | int retval; |
190 | unsigned long charge; | 190 | unsigned long charge; |
191 | struct mempolicy *pol; | 191 | struct mempolicy *pol; |
192 | 192 | ||
193 | down_write(&oldmm->mmap_sem); | 193 | down_write(&oldmm->mmap_sem); |
194 | flush_cache_mm(current->mm); | 194 | flush_cache_mm(oldmm); |
195 | down_write(&mm->mmap_sem); | ||
196 | |||
195 | mm->locked_vm = 0; | 197 | mm->locked_vm = 0; |
196 | mm->mmap = NULL; | 198 | mm->mmap = NULL; |
197 | mm->mmap_cache = NULL; | 199 | mm->mmap_cache = NULL; |
198 | mm->free_area_cache = oldmm->mmap_base; | 200 | mm->free_area_cache = oldmm->mmap_base; |
199 | mm->cached_hole_size = ~0UL; | 201 | mm->cached_hole_size = ~0UL; |
200 | mm->map_count = 0; | 202 | mm->map_count = 0; |
201 | set_mm_counter(mm, rss, 0); | ||
202 | set_mm_counter(mm, anon_rss, 0); | ||
203 | cpus_clear(mm->cpu_vm_mask); | 203 | cpus_clear(mm->cpu_vm_mask); |
204 | mm->mm_rb = RB_ROOT; | 204 | mm->mm_rb = RB_ROOT; |
205 | rb_link = &mm->mm_rb.rb_node; | 205 | rb_link = &mm->mm_rb.rb_node; |
206 | rb_parent = NULL; | 206 | rb_parent = NULL; |
207 | pprev = &mm->mmap; | 207 | pprev = &mm->mmap; |
208 | 208 | ||
209 | for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { | 209 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
210 | struct file *file; | 210 | struct file *file; |
211 | 211 | ||
212 | if (mpnt->vm_flags & VM_DONTCOPY) { | 212 | if (mpnt->vm_flags & VM_DONTCOPY) { |
213 | long pages = vma_pages(mpnt); | 213 | long pages = vma_pages(mpnt); |
214 | mm->total_vm -= pages; | 214 | mm->total_vm -= pages; |
215 | __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, | 215 | vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, |
216 | -pages); | 216 | -pages); |
217 | continue; | 217 | continue; |
218 | } | 218 | } |
@@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
253 | } | 253 | } |
254 | 254 | ||
255 | /* | 255 | /* |
256 | * Link in the new vma and copy the page table entries: | 256 | * Link in the new vma and copy the page table entries. |
257 | * link in first so that swapoff can see swap entries. | ||
258 | * Note that, exceptionally, here the vma is inserted | ||
259 | * without holding mm->mmap_sem. | ||
260 | */ | 257 | */ |
261 | spin_lock(&mm->page_table_lock); | ||
262 | *pprev = tmp; | 258 | *pprev = tmp; |
263 | pprev = &tmp->vm_next; | 259 | pprev = &tmp->vm_next; |
264 | 260 | ||
@@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
267 | rb_parent = &tmp->vm_rb; | 263 | rb_parent = &tmp->vm_rb; |
268 | 264 | ||
269 | mm->map_count++; | 265 | mm->map_count++; |
270 | retval = copy_page_range(mm, current->mm, tmp); | 266 | retval = copy_page_range(mm, oldmm, mpnt); |
271 | spin_unlock(&mm->page_table_lock); | ||
272 | 267 | ||
273 | if (tmp->vm_ops && tmp->vm_ops->open) | 268 | if (tmp->vm_ops && tmp->vm_ops->open) |
274 | tmp->vm_ops->open(tmp); | 269 | tmp->vm_ops->open(tmp); |
@@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) | |||
277 | goto out; | 272 | goto out; |
278 | } | 273 | } |
279 | retval = 0; | 274 | retval = 0; |
280 | |||
281 | out: | 275 | out: |
282 | flush_tlb_mm(current->mm); | 276 | up_write(&mm->mmap_sem); |
277 | flush_tlb_mm(oldmm); | ||
283 | up_write(&oldmm->mmap_sem); | 278 | up_write(&oldmm->mmap_sem); |
284 | return retval; | 279 | return retval; |
285 | fail_nomem_policy: | 280 | fail_nomem_policy: |
@@ -323,10 +318,11 @@ static struct mm_struct * mm_init(struct mm_struct * mm) | |||
323 | INIT_LIST_HEAD(&mm->mmlist); | 318 | INIT_LIST_HEAD(&mm->mmlist); |
324 | mm->core_waiters = 0; | 319 | mm->core_waiters = 0; |
325 | mm->nr_ptes = 0; | 320 | mm->nr_ptes = 0; |
321 | set_mm_counter(mm, file_rss, 0); | ||
322 | set_mm_counter(mm, anon_rss, 0); | ||
326 | spin_lock_init(&mm->page_table_lock); | 323 | spin_lock_init(&mm->page_table_lock); |
327 | rwlock_init(&mm->ioctx_list_lock); | 324 | rwlock_init(&mm->ioctx_list_lock); |
328 | mm->ioctx_list = NULL; | 325 | mm->ioctx_list = NULL; |
329 | mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); | ||
330 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 326 | mm->free_area_cache = TASK_UNMAPPED_BASE; |
331 | mm->cached_hole_size = ~0UL; | 327 | mm->cached_hole_size = ~0UL; |
332 | 328 | ||
@@ -472,13 +468,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |||
472 | if (clone_flags & CLONE_VM) { | 468 | if (clone_flags & CLONE_VM) { |
473 | atomic_inc(&oldmm->mm_users); | 469 | atomic_inc(&oldmm->mm_users); |
474 | mm = oldmm; | 470 | mm = oldmm; |
475 | /* | ||
476 | * There are cases where the PTL is held to ensure no | ||
477 | * new threads start up in user mode using an mm, which | ||
478 | * allows optimizing out ipis; the tlb_gather_mmu code | ||
479 | * is an example. | ||
480 | */ | ||
481 | spin_unlock_wait(&oldmm->page_table_lock); | ||
482 | goto good_mm; | 471 | goto good_mm; |
483 | } | 472 | } |
484 | 473 | ||
@@ -499,7 +488,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |||
499 | if (retval) | 488 | if (retval) |
500 | goto free_pt; | 489 | goto free_pt; |
501 | 490 | ||
502 | mm->hiwater_rss = get_mm_counter(mm,rss); | 491 | mm->hiwater_rss = get_mm_rss(mm); |
503 | mm->hiwater_vm = mm->total_vm; | 492 | mm->hiwater_vm = mm->total_vm; |
504 | 493 | ||
505 | good_mm: | 494 | good_mm: |
@@ -848,7 +837,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) | |||
848 | { | 837 | { |
849 | unsigned long new_flags = p->flags; | 838 | unsigned long new_flags = p->flags; |
850 | 839 | ||
851 | new_flags &= ~PF_SUPERPRIV; | 840 | new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE); |
852 | new_flags |= PF_FORKNOEXEC; | 841 | new_flags |= PF_FORKNOEXEC; |
853 | if (!(clone_flags & CLONE_PTRACE)) | 842 | if (!(clone_flags & CLONE_PTRACE)) |
854 | p->ptrace = 0; | 843 | p->ptrace = 0; |
@@ -928,7 +917,7 @@ static task_t *copy_process(unsigned long clone_flags, | |||
928 | if (nr_threads >= max_threads) | 917 | if (nr_threads >= max_threads) |
929 | goto bad_fork_cleanup_count; | 918 | goto bad_fork_cleanup_count; |
930 | 919 | ||
931 | if (!try_module_get(p->thread_info->exec_domain->module)) | 920 | if (!try_module_get(task_thread_info(p)->exec_domain->module)) |
932 | goto bad_fork_cleanup_count; | 921 | goto bad_fork_cleanup_count; |
933 | 922 | ||
934 | if (p->binfmt && !try_module_get(p->binfmt->module)) | 923 | if (p->binfmt && !try_module_get(p->binfmt->module)) |
@@ -1135,8 +1124,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1135 | if (unlikely(p->ptrace & PT_PTRACED)) | 1124 | if (unlikely(p->ptrace & PT_PTRACED)) |
1136 | __ptrace_link(p, current->parent); | 1125 | __ptrace_link(p, current->parent); |
1137 | 1126 | ||
1138 | cpuset_fork(p); | ||
1139 | |||
1140 | attach_pid(p, PIDTYPE_PID, p->pid); | 1127 | attach_pid(p, PIDTYPE_PID, p->pid); |
1141 | attach_pid(p, PIDTYPE_TGID, p->tgid); | 1128 | attach_pid(p, PIDTYPE_TGID, p->tgid); |
1142 | if (thread_group_leader(p)) { | 1129 | if (thread_group_leader(p)) { |
@@ -1152,6 +1139,8 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1152 | nr_threads++; | 1139 | nr_threads++; |
1153 | total_forks++; | 1140 | total_forks++; |
1154 | write_unlock_irq(&tasklist_lock); | 1141 | write_unlock_irq(&tasklist_lock); |
1142 | proc_fork_connector(p); | ||
1143 | cpuset_fork(p); | ||
1155 | retval = 0; | 1144 | retval = 0; |
1156 | 1145 | ||
1157 | fork_out: | 1146 | fork_out: |
@@ -1188,7 +1177,7 @@ bad_fork_cleanup: | |||
1188 | if (p->binfmt) | 1177 | if (p->binfmt) |
1189 | module_put(p->binfmt->module); | 1178 | module_put(p->binfmt->module); |
1190 | bad_fork_cleanup_put_domain: | 1179 | bad_fork_cleanup_put_domain: |
1191 | module_put(p->thread_info->exec_domain->module); | 1180 | module_put(task_thread_info(p)->exec_domain->module); |
1192 | bad_fork_cleanup_count: | 1181 | bad_fork_cleanup_count: |
1193 | put_group_info(p->group_info); | 1182 | put_group_info(p->group_info); |
1194 | atomic_dec(&p->user->processes); | 1183 | atomic_dec(&p->user->processes); |
diff --git a/kernel/futex.c b/kernel/futex.c index ca05fe6a70b2..5872e3507f35 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -201,23 +201,6 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
201 | * from swap. But that's a lot of code to duplicate here | 201 | * from swap. But that's a lot of code to duplicate here |
202 | * for a rare case, so we simply fetch the page. | 202 | * for a rare case, so we simply fetch the page. |
203 | */ | 203 | */ |
204 | |||
205 | /* | ||
206 | * Do a quick atomic lookup first - this is the fastpath. | ||
207 | */ | ||
208 | spin_lock(&current->mm->page_table_lock); | ||
209 | page = follow_page(mm, uaddr, 0); | ||
210 | if (likely(page != NULL)) { | ||
211 | key->shared.pgoff = | ||
212 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
213 | spin_unlock(&current->mm->page_table_lock); | ||
214 | return 0; | ||
215 | } | ||
216 | spin_unlock(&current->mm->page_table_lock); | ||
217 | |||
218 | /* | ||
219 | * Do it the general way. | ||
220 | */ | ||
221 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | 204 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); |
222 | if (err >= 0) { | 205 | if (err >= 0) { |
223 | key->shared.pgoff = | 206 | key->shared.pgoff = |
@@ -367,6 +350,11 @@ retry: | |||
367 | if (bh1 != bh2) | 350 | if (bh1 != bh2) |
368 | spin_unlock(&bh2->lock); | 351 | spin_unlock(&bh2->lock); |
369 | 352 | ||
353 | if (unlikely(op_ret != -EFAULT)) { | ||
354 | ret = op_ret; | ||
355 | goto out; | ||
356 | } | ||
357 | |||
370 | /* futex_atomic_op_inuser needs to both read and write | 358 | /* futex_atomic_op_inuser needs to both read and write |
371 | * *(int __user *)uaddr2, but we can't modify it | 359 | * *(int __user *)uaddr2, but we can't modify it |
372 | * non-atomically. Therefore, if get_user below is not | 360 | * non-atomically. Therefore, if get_user below is not |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3ff7b925c387..51df337b37db 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -117,14 +117,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
117 | /* | 117 | /* |
118 | * No locking required for CPU-local interrupts: | 118 | * No locking required for CPU-local interrupts: |
119 | */ | 119 | */ |
120 | desc->handler->ack(irq); | 120 | if (desc->handler->ack) |
121 | desc->handler->ack(irq); | ||
121 | action_ret = handle_IRQ_event(irq, regs, desc->action); | 122 | action_ret = handle_IRQ_event(irq, regs, desc->action); |
122 | desc->handler->end(irq); | 123 | desc->handler->end(irq); |
123 | return 1; | 124 | return 1; |
124 | } | 125 | } |
125 | 126 | ||
126 | spin_lock(&desc->lock); | 127 | spin_lock(&desc->lock); |
127 | desc->handler->ack(irq); | 128 | if (desc->handler->ack) |
129 | desc->handler->ack(irq); | ||
128 | /* | 130 | /* |
129 | * REPLAY is when Linux resends an IRQ that was dropped earlier | 131 | * REPLAY is when Linux resends an IRQ that was dropped earlier |
130 | * WAITING is used by probe to mark irqs that are being tested | 132 | * WAITING is used by probe to mark irqs that are being tested |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1cfdb08ddf20..81c49a4d679e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -24,6 +24,7 @@ cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; | |||
24 | 24 | ||
25 | /** | 25 | /** |
26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
27 | * @irq: interrupt number to wait for | ||
27 | * | 28 | * |
28 | * This function waits for any pending IRQ handlers for this interrupt | 29 | * This function waits for any pending IRQ handlers for this interrupt |
29 | * to complete before returning. If you use this function while | 30 | * to complete before returning. If you use this function while |
@@ -35,6 +36,9 @@ void synchronize_irq(unsigned int irq) | |||
35 | { | 36 | { |
36 | struct irq_desc *desc = irq_desc + irq; | 37 | struct irq_desc *desc = irq_desc + irq; |
37 | 38 | ||
39 | if (irq >= NR_IRQS) | ||
40 | return; | ||
41 | |||
38 | while (desc->status & IRQ_INPROGRESS) | 42 | while (desc->status & IRQ_INPROGRESS) |
39 | cpu_relax(); | 43 | cpu_relax(); |
40 | } | 44 | } |
@@ -59,6 +63,9 @@ void disable_irq_nosync(unsigned int irq) | |||
59 | irq_desc_t *desc = irq_desc + irq; | 63 | irq_desc_t *desc = irq_desc + irq; |
60 | unsigned long flags; | 64 | unsigned long flags; |
61 | 65 | ||
66 | if (irq >= NR_IRQS) | ||
67 | return; | ||
68 | |||
62 | spin_lock_irqsave(&desc->lock, flags); | 69 | spin_lock_irqsave(&desc->lock, flags); |
63 | if (!desc->depth++) { | 70 | if (!desc->depth++) { |
64 | desc->status |= IRQ_DISABLED; | 71 | desc->status |= IRQ_DISABLED; |
@@ -85,6 +92,9 @@ void disable_irq(unsigned int irq) | |||
85 | { | 92 | { |
86 | irq_desc_t *desc = irq_desc + irq; | 93 | irq_desc_t *desc = irq_desc + irq; |
87 | 94 | ||
95 | if (irq >= NR_IRQS) | ||
96 | return; | ||
97 | |||
88 | disable_irq_nosync(irq); | 98 | disable_irq_nosync(irq); |
89 | if (desc->action) | 99 | if (desc->action) |
90 | synchronize_irq(irq); | 100 | synchronize_irq(irq); |
@@ -107,6 +117,9 @@ void enable_irq(unsigned int irq) | |||
107 | irq_desc_t *desc = irq_desc + irq; | 117 | irq_desc_t *desc = irq_desc + irq; |
108 | unsigned long flags; | 118 | unsigned long flags; |
109 | 119 | ||
120 | if (irq >= NR_IRQS) | ||
121 | return; | ||
122 | |||
110 | spin_lock_irqsave(&desc->lock, flags); | 123 | spin_lock_irqsave(&desc->lock, flags); |
111 | switch (desc->depth) { | 124 | switch (desc->depth) { |
112 | case 0: | 125 | case 0: |
@@ -162,6 +175,9 @@ int setup_irq(unsigned int irq, struct irqaction * new) | |||
162 | unsigned long flags; | 175 | unsigned long flags; |
163 | int shared = 0; | 176 | int shared = 0; |
164 | 177 | ||
178 | if (irq >= NR_IRQS) | ||
179 | return -EINVAL; | ||
180 | |||
165 | if (desc->handler == &no_irq_type) | 181 | if (desc->handler == &no_irq_type) |
166 | return -ENOSYS; | 182 | return -ENOSYS; |
167 | /* | 183 | /* |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 13bcec151b57..39277dd6bf90 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
19 | #include <linux/err.h> | 19 | #include <linux/err.h> |
20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
21 | #include <linux/sched.h> /* for cond_resched */ | ||
21 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
22 | 23 | ||
23 | #include <asm/sections.h> | 24 | #include <asm/sections.h> |
diff --git a/kernel/kexec.c b/kernel/kexec.c index cdd4dcd8fb63..2c95848fbce8 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p) | |||
90 | static int kimage_is_destination_range(struct kimage *image, | 90 | static int kimage_is_destination_range(struct kimage *image, |
91 | unsigned long start, unsigned long end); | 91 | unsigned long start, unsigned long end); |
92 | static struct page *kimage_alloc_page(struct kimage *image, | 92 | static struct page *kimage_alloc_page(struct kimage *image, |
93 | unsigned int gfp_mask, | 93 | gfp_t gfp_mask, |
94 | unsigned long dest); | 94 | unsigned long dest); |
95 | 95 | ||
96 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | 96 | static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, |
@@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image, | |||
326 | return 0; | 326 | return 0; |
327 | } | 327 | } |
328 | 328 | ||
329 | static struct page *kimage_alloc_pages(unsigned int gfp_mask, | 329 | static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) |
330 | unsigned int order) | ||
331 | { | 330 | { |
332 | struct page *pages; | 331 | struct page *pages; |
333 | 332 | ||
@@ -335,7 +334,7 @@ static struct page *kimage_alloc_pages(unsigned int gfp_mask, | |||
335 | if (pages) { | 334 | if (pages) { |
336 | unsigned int count, i; | 335 | unsigned int count, i; |
337 | pages->mapping = NULL; | 336 | pages->mapping = NULL; |
338 | pages->private = order; | 337 | set_page_private(pages, order); |
339 | count = 1 << order; | 338 | count = 1 << order; |
340 | for (i = 0; i < count; i++) | 339 | for (i = 0; i < count; i++) |
341 | SetPageReserved(pages + i); | 340 | SetPageReserved(pages + i); |
@@ -348,7 +347,7 @@ static void kimage_free_pages(struct page *page) | |||
348 | { | 347 | { |
349 | unsigned int order, count, i; | 348 | unsigned int order, count, i; |
350 | 349 | ||
351 | order = page->private; | 350 | order = page_private(page); |
352 | count = 1 << order; | 351 | count = 1 << order; |
353 | for (i = 0; i < count; i++) | 352 | for (i = 0; i < count; i++) |
354 | ClearPageReserved(page + i); | 353 | ClearPageReserved(page + i); |
@@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image, | |||
654 | } | 653 | } |
655 | 654 | ||
656 | static struct page *kimage_alloc_page(struct kimage *image, | 655 | static struct page *kimage_alloc_page(struct kimage *image, |
657 | unsigned int gfp_mask, | 656 | gfp_t gfp_mask, |
658 | unsigned long destination) | 657 | unsigned long destination) |
659 | { | 658 | { |
660 | /* | 659 | /* |
diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 179baafcdd96..64ab045c3d9d 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c | |||
@@ -36,7 +36,7 @@ | |||
36 | * struct kfifo with kfree(). | 36 | * struct kfifo with kfree(). |
37 | */ | 37 | */ |
38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, | 38 | struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, |
39 | unsigned int __nocast gfp_mask, spinlock_t *lock) | 39 | gfp_t gfp_mask, spinlock_t *lock) |
40 | { | 40 | { |
41 | struct kfifo *fifo; | 41 | struct kfifo *fifo; |
42 | 42 | ||
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init); | |||
64 | * | 64 | * |
65 | * The size will be rounded-up to a power of 2. | 65 | * The size will be rounded-up to a power of 2. |
66 | */ | 66 | */ |
67 | struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) | 67 | struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) |
68 | { | 68 | { |
69 | unsigned char *buffer; | 69 | unsigned char *buffer; |
70 | struct kfifo *ret; | 70 | struct kfifo *ret; |
diff --git a/kernel/kmod.c b/kernel/kmod.c index 44166e3bb8af..51a892063aaa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -131,14 +131,14 @@ struct subprocess_info { | |||
131 | static int ____call_usermodehelper(void *data) | 131 | static int ____call_usermodehelper(void *data) |
132 | { | 132 | { |
133 | struct subprocess_info *sub_info = data; | 133 | struct subprocess_info *sub_info = data; |
134 | struct key *old_session; | 134 | struct key *new_session, *old_session; |
135 | int retval; | 135 | int retval; |
136 | 136 | ||
137 | /* Unblock all signals and set the session keyring. */ | 137 | /* Unblock all signals and set the session keyring. */ |
138 | key_get(sub_info->ring); | 138 | new_session = key_get(sub_info->ring); |
139 | flush_signals(current); | 139 | flush_signals(current); |
140 | spin_lock_irq(&current->sighand->siglock); | 140 | spin_lock_irq(&current->sighand->siglock); |
141 | old_session = __install_session_keyring(current, sub_info->ring); | 141 | old_session = __install_session_keyring(current, new_session); |
142 | flush_signal_handlers(current, 1); | 142 | flush_signal_handlers(current, 1); |
143 | sigemptyset(&current->blocked); | 143 | sigemptyset(&current->blocked); |
144 | recalc_sigpending(); | 144 | recalc_sigpending(); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f3ea492ab44d..5beda378cc75 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -32,9 +32,9 @@ | |||
32 | * <prasanna@in.ibm.com> added function-return probes. | 32 | * <prasanna@in.ibm.com> added function-return probes. |
33 | */ | 33 | */ |
34 | #include <linux/kprobes.h> | 34 | #include <linux/kprobes.h> |
35 | #include <linux/spinlock.h> | ||
36 | #include <linux/hash.h> | 35 | #include <linux/hash.h> |
37 | #include <linux/init.h> | 36 | #include <linux/init.h> |
37 | #include <linux/slab.h> | ||
38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/moduleloader.h> | 39 | #include <linux/moduleloader.h> |
40 | #include <asm-generic/sections.h> | 40 | #include <asm-generic/sections.h> |
@@ -48,9 +48,9 @@ | |||
48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
50 | 50 | ||
51 | unsigned int kprobe_cpu = NR_CPUS; | 51 | static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */ |
52 | static DEFINE_SPINLOCK(kprobe_lock); | 52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
53 | static struct kprobe *curr_kprobe; | 53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
54 | 54 | ||
55 | /* | 55 | /* |
56 | * kprobe->ainsn.insn points to the copy of the instruction to be | 56 | * kprobe->ainsn.insn points to the copy of the instruction to be |
@@ -152,50 +152,31 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) | |||
152 | } | 152 | } |
153 | } | 153 | } |
154 | 154 | ||
155 | /* Locks kprobe: irqs must be disabled */ | 155 | /* We have preemption disabled.. so it is safe to use __ versions */ |
156 | void __kprobes lock_kprobes(void) | 156 | static inline void set_kprobe_instance(struct kprobe *kp) |
157 | { | 157 | { |
158 | unsigned long flags = 0; | 158 | __get_cpu_var(kprobe_instance) = kp; |
159 | |||
160 | /* Avoiding local interrupts to happen right after we take the kprobe_lock | ||
161 | * and before we get a chance to update kprobe_cpu, this to prevent | ||
162 | * deadlock when we have a kprobe on ISR routine and a kprobe on task | ||
163 | * routine | ||
164 | */ | ||
165 | local_irq_save(flags); | ||
166 | |||
167 | spin_lock(&kprobe_lock); | ||
168 | kprobe_cpu = smp_processor_id(); | ||
169 | |||
170 | local_irq_restore(flags); | ||
171 | } | 159 | } |
172 | 160 | ||
173 | void __kprobes unlock_kprobes(void) | 161 | static inline void reset_kprobe_instance(void) |
174 | { | 162 | { |
175 | unsigned long flags = 0; | 163 | __get_cpu_var(kprobe_instance) = NULL; |
176 | |||
177 | /* Avoiding local interrupts to happen right after we update | ||
178 | * kprobe_cpu and before we get a a chance to release kprobe_lock, | ||
179 | * this to prevent deadlock when we have a kprobe on ISR routine and | ||
180 | * a kprobe on task routine | ||
181 | */ | ||
182 | local_irq_save(flags); | ||
183 | |||
184 | kprobe_cpu = NR_CPUS; | ||
185 | spin_unlock(&kprobe_lock); | ||
186 | |||
187 | local_irq_restore(flags); | ||
188 | } | 164 | } |
189 | 165 | ||
190 | /* You have to be holding the kprobe_lock */ | 166 | /* |
167 | * This routine is called either: | ||
168 | * - under the kprobe_lock spinlock - during kprobe_[un]register() | ||
169 | * OR | ||
170 | * - with preemption disabled - from arch/xxx/kernel/kprobes.c | ||
171 | */ | ||
191 | struct kprobe __kprobes *get_kprobe(void *addr) | 172 | struct kprobe __kprobes *get_kprobe(void *addr) |
192 | { | 173 | { |
193 | struct hlist_head *head; | 174 | struct hlist_head *head; |
194 | struct hlist_node *node; | 175 | struct hlist_node *node; |
176 | struct kprobe *p; | ||
195 | 177 | ||
196 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; | 178 | head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; |
197 | hlist_for_each(node, head) { | 179 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
198 | struct kprobe *p = hlist_entry(node, struct kprobe, hlist); | ||
199 | if (p->addr == addr) | 180 | if (p->addr == addr) |
200 | return p; | 181 | return p; |
201 | } | 182 | } |
@@ -210,13 +191,13 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
210 | { | 191 | { |
211 | struct kprobe *kp; | 192 | struct kprobe *kp; |
212 | 193 | ||
213 | list_for_each_entry(kp, &p->list, list) { | 194 | list_for_each_entry_rcu(kp, &p->list, list) { |
214 | if (kp->pre_handler) { | 195 | if (kp->pre_handler) { |
215 | curr_kprobe = kp; | 196 | set_kprobe_instance(kp); |
216 | if (kp->pre_handler(kp, regs)) | 197 | if (kp->pre_handler(kp, regs)) |
217 | return 1; | 198 | return 1; |
218 | } | 199 | } |
219 | curr_kprobe = NULL; | 200 | reset_kprobe_instance(); |
220 | } | 201 | } |
221 | return 0; | 202 | return 0; |
222 | } | 203 | } |
@@ -226,11 +207,11 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
226 | { | 207 | { |
227 | struct kprobe *kp; | 208 | struct kprobe *kp; |
228 | 209 | ||
229 | list_for_each_entry(kp, &p->list, list) { | 210 | list_for_each_entry_rcu(kp, &p->list, list) { |
230 | if (kp->post_handler) { | 211 | if (kp->post_handler) { |
231 | curr_kprobe = kp; | 212 | set_kprobe_instance(kp); |
232 | kp->post_handler(kp, regs, flags); | 213 | kp->post_handler(kp, regs, flags); |
233 | curr_kprobe = NULL; | 214 | reset_kprobe_instance(); |
234 | } | 215 | } |
235 | } | 216 | } |
236 | return; | 217 | return; |
@@ -239,12 +220,14 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
239 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 220 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
240 | int trapnr) | 221 | int trapnr) |
241 | { | 222 | { |
223 | struct kprobe *cur = __get_cpu_var(kprobe_instance); | ||
224 | |||
242 | /* | 225 | /* |
243 | * if we faulted "during" the execution of a user specified | 226 | * if we faulted "during" the execution of a user specified |
244 | * probe handler, invoke just that probe's fault handler | 227 | * probe handler, invoke just that probe's fault handler |
245 | */ | 228 | */ |
246 | if (curr_kprobe && curr_kprobe->fault_handler) { | 229 | if (cur && cur->fault_handler) { |
247 | if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr)) | 230 | if (cur->fault_handler(cur, regs, trapnr)) |
248 | return 1; | 231 | return 1; |
249 | } | 232 | } |
250 | return 0; | 233 | return 0; |
@@ -252,17 +235,18 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
252 | 235 | ||
253 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 236 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
254 | { | 237 | { |
255 | struct kprobe *kp = curr_kprobe; | 238 | struct kprobe *cur = __get_cpu_var(kprobe_instance); |
256 | if (curr_kprobe && kp->break_handler) { | 239 | int ret = 0; |
257 | if (kp->break_handler(kp, regs)) { | 240 | |
258 | curr_kprobe = NULL; | 241 | if (cur && cur->break_handler) { |
259 | return 1; | 242 | if (cur->break_handler(cur, regs)) |
260 | } | 243 | ret = 1; |
261 | } | 244 | } |
262 | curr_kprobe = NULL; | 245 | reset_kprobe_instance(); |
263 | return 0; | 246 | return ret; |
264 | } | 247 | } |
265 | 248 | ||
249 | /* Called with kretprobe_lock held */ | ||
266 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) | 250 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) |
267 | { | 251 | { |
268 | struct hlist_node *node; | 252 | struct hlist_node *node; |
@@ -272,6 +256,7 @@ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) | |||
272 | return NULL; | 256 | return NULL; |
273 | } | 257 | } |
274 | 258 | ||
259 | /* Called with kretprobe_lock held */ | ||
275 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe | 260 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe |
276 | *rp) | 261 | *rp) |
277 | { | 262 | { |
@@ -282,6 +267,7 @@ static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe | |||
282 | return NULL; | 267 | return NULL; |
283 | } | 268 | } |
284 | 269 | ||
270 | /* Called with kretprobe_lock held */ | ||
285 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) | 271 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) |
286 | { | 272 | { |
287 | /* | 273 | /* |
@@ -300,6 +286,7 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri) | |||
300 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | 286 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); |
301 | } | 287 | } |
302 | 288 | ||
289 | /* Called with kretprobe_lock held */ | ||
303 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) | 290 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) |
304 | { | 291 | { |
305 | /* remove rp inst off the rprobe_inst_table */ | 292 | /* remove rp inst off the rprobe_inst_table */ |
@@ -333,13 +320,13 @@ void __kprobes kprobe_flush_task(struct task_struct *tk) | |||
333 | struct hlist_node *node, *tmp; | 320 | struct hlist_node *node, *tmp; |
334 | unsigned long flags = 0; | 321 | unsigned long flags = 0; |
335 | 322 | ||
336 | spin_lock_irqsave(&kprobe_lock, flags); | 323 | spin_lock_irqsave(&kretprobe_lock, flags); |
337 | head = kretprobe_inst_table_head(current); | 324 | head = kretprobe_inst_table_head(current); |
338 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { | 325 | hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { |
339 | if (ri->task == tk) | 326 | if (ri->task == tk) |
340 | recycle_rp_inst(ri); | 327 | recycle_rp_inst(ri); |
341 | } | 328 | } |
342 | spin_unlock_irqrestore(&kprobe_lock, flags); | 329 | spin_unlock_irqrestore(&kretprobe_lock, flags); |
343 | } | 330 | } |
344 | 331 | ||
345 | /* | 332 | /* |
@@ -350,9 +337,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, | |||
350 | struct pt_regs *regs) | 337 | struct pt_regs *regs) |
351 | { | 338 | { |
352 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); | 339 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); |
340 | unsigned long flags = 0; | ||
353 | 341 | ||
354 | /*TODO: consider to only swap the RA after the last pre_handler fired */ | 342 | /*TODO: consider to only swap the RA after the last pre_handler fired */ |
343 | spin_lock_irqsave(&kretprobe_lock, flags); | ||
355 | arch_prepare_kretprobe(rp, regs); | 344 | arch_prepare_kretprobe(rp, regs); |
345 | spin_unlock_irqrestore(&kretprobe_lock, flags); | ||
356 | return 0; | 346 | return 0; |
357 | } | 347 | } |
358 | 348 | ||
@@ -383,13 +373,13 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
383 | struct kprobe *kp; | 373 | struct kprobe *kp; |
384 | 374 | ||
385 | if (p->break_handler) { | 375 | if (p->break_handler) { |
386 | list_for_each_entry(kp, &old_p->list, list) { | 376 | list_for_each_entry_rcu(kp, &old_p->list, list) { |
387 | if (kp->break_handler) | 377 | if (kp->break_handler) |
388 | return -EEXIST; | 378 | return -EEXIST; |
389 | } | 379 | } |
390 | list_add_tail(&p->list, &old_p->list); | 380 | list_add_tail_rcu(&p->list, &old_p->list); |
391 | } else | 381 | } else |
392 | list_add(&p->list, &old_p->list); | 382 | list_add_rcu(&p->list, &old_p->list); |
393 | return 0; | 383 | return 0; |
394 | } | 384 | } |
395 | 385 | ||
@@ -407,18 +397,18 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
407 | ap->break_handler = aggr_break_handler; | 397 | ap->break_handler = aggr_break_handler; |
408 | 398 | ||
409 | INIT_LIST_HEAD(&ap->list); | 399 | INIT_LIST_HEAD(&ap->list); |
410 | list_add(&p->list, &ap->list); | 400 | list_add_rcu(&p->list, &ap->list); |
411 | 401 | ||
412 | INIT_HLIST_NODE(&ap->hlist); | 402 | INIT_HLIST_NODE(&ap->hlist); |
413 | hlist_del(&p->hlist); | 403 | hlist_del_rcu(&p->hlist); |
414 | hlist_add_head(&ap->hlist, | 404 | hlist_add_head_rcu(&ap->hlist, |
415 | &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); | 405 | &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); |
416 | } | 406 | } |
417 | 407 | ||
418 | /* | 408 | /* |
419 | * This is the second or subsequent kprobe at the address - handle | 409 | * This is the second or subsequent kprobe at the address - handle |
420 | * the intricacies | 410 | * the intricacies |
421 | * TODO: Move kcalloc outside the spinlock | 411 | * TODO: Move kcalloc outside the spin_lock |
422 | */ | 412 | */ |
423 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | 413 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, |
424 | struct kprobe *p) | 414 | struct kprobe *p) |
@@ -444,7 +434,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | |||
444 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) | 434 | static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) |
445 | { | 435 | { |
446 | arch_disarm_kprobe(p); | 436 | arch_disarm_kprobe(p); |
447 | hlist_del(&p->hlist); | 437 | hlist_del_rcu(&p->hlist); |
448 | spin_unlock_irqrestore(&kprobe_lock, flags); | 438 | spin_unlock_irqrestore(&kprobe_lock, flags); |
449 | arch_remove_kprobe(p); | 439 | arch_remove_kprobe(p); |
450 | } | 440 | } |
@@ -452,11 +442,10 @@ static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) | |||
452 | static inline void cleanup_aggr_kprobe(struct kprobe *old_p, | 442 | static inline void cleanup_aggr_kprobe(struct kprobe *old_p, |
453 | struct kprobe *p, unsigned long flags) | 443 | struct kprobe *p, unsigned long flags) |
454 | { | 444 | { |
455 | list_del(&p->list); | 445 | list_del_rcu(&p->list); |
456 | if (list_empty(&old_p->list)) { | 446 | if (list_empty(&old_p->list)) |
457 | cleanup_kprobe(old_p, flags); | 447 | cleanup_kprobe(old_p, flags); |
458 | kfree(old_p); | 448 | else |
459 | } else | ||
460 | spin_unlock_irqrestore(&kprobe_lock, flags); | 449 | spin_unlock_irqrestore(&kprobe_lock, flags); |
461 | } | 450 | } |
462 | 451 | ||
@@ -479,9 +468,9 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
479 | if ((ret = arch_prepare_kprobe(p)) != 0) | 468 | if ((ret = arch_prepare_kprobe(p)) != 0) |
480 | goto rm_kprobe; | 469 | goto rm_kprobe; |
481 | 470 | ||
471 | p->nmissed = 0; | ||
482 | spin_lock_irqsave(&kprobe_lock, flags); | 472 | spin_lock_irqsave(&kprobe_lock, flags); |
483 | old_p = get_kprobe(p->addr); | 473 | old_p = get_kprobe(p->addr); |
484 | p->nmissed = 0; | ||
485 | if (old_p) { | 474 | if (old_p) { |
486 | ret = register_aggr_kprobe(old_p, p); | 475 | ret = register_aggr_kprobe(old_p, p); |
487 | goto out; | 476 | goto out; |
@@ -489,7 +478,7 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
489 | 478 | ||
490 | arch_copy_kprobe(p); | 479 | arch_copy_kprobe(p); |
491 | INIT_HLIST_NODE(&p->hlist); | 480 | INIT_HLIST_NODE(&p->hlist); |
492 | hlist_add_head(&p->hlist, | 481 | hlist_add_head_rcu(&p->hlist, |
493 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 482 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
494 | 483 | ||
495 | arch_arm_kprobe(p); | 484 | arch_arm_kprobe(p); |
@@ -510,10 +499,16 @@ void __kprobes unregister_kprobe(struct kprobe *p) | |||
510 | spin_lock_irqsave(&kprobe_lock, flags); | 499 | spin_lock_irqsave(&kprobe_lock, flags); |
511 | old_p = get_kprobe(p->addr); | 500 | old_p = get_kprobe(p->addr); |
512 | if (old_p) { | 501 | if (old_p) { |
502 | /* cleanup_*_kprobe() does the spin_unlock_irqrestore */ | ||
513 | if (old_p->pre_handler == aggr_pre_handler) | 503 | if (old_p->pre_handler == aggr_pre_handler) |
514 | cleanup_aggr_kprobe(old_p, p, flags); | 504 | cleanup_aggr_kprobe(old_p, p, flags); |
515 | else | 505 | else |
516 | cleanup_kprobe(p, flags); | 506 | cleanup_kprobe(p, flags); |
507 | |||
508 | synchronize_sched(); | ||
509 | if (old_p->pre_handler == aggr_pre_handler && | ||
510 | list_empty(&old_p->list)) | ||
511 | kfree(old_p); | ||
517 | } else | 512 | } else |
518 | spin_unlock_irqrestore(&kprobe_lock, flags); | 513 | spin_unlock_irqrestore(&kprobe_lock, flags); |
519 | } | 514 | } |
@@ -590,13 +585,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp) | |||
590 | 585 | ||
591 | unregister_kprobe(&rp->kp); | 586 | unregister_kprobe(&rp->kp); |
592 | /* No race here */ | 587 | /* No race here */ |
593 | spin_lock_irqsave(&kprobe_lock, flags); | 588 | spin_lock_irqsave(&kretprobe_lock, flags); |
594 | free_rp_inst(rp); | 589 | free_rp_inst(rp); |
595 | while ((ri = get_used_rp_inst(rp)) != NULL) { | 590 | while ((ri = get_used_rp_inst(rp)) != NULL) { |
596 | ri->rp = NULL; | 591 | ri->rp = NULL; |
597 | hlist_del(&ri->uflist); | 592 | hlist_del(&ri->uflist); |
598 | } | 593 | } |
599 | spin_unlock_irqrestore(&kprobe_lock, flags); | 594 | spin_unlock_irqrestore(&kretprobe_lock, flags); |
600 | } | 595 | } |
601 | 596 | ||
602 | static int __init init_kprobes(void) | 597 | static int __init init_kprobes(void) |
diff --git a/kernel/kthread.c b/kernel/kthread.c index f50f174e92da..e75950a1092c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind); | |||
165 | 165 | ||
166 | int kthread_stop(struct task_struct *k) | 166 | int kthread_stop(struct task_struct *k) |
167 | { | 167 | { |
168 | return kthread_stop_sem(k, NULL); | ||
169 | } | ||
170 | EXPORT_SYMBOL(kthread_stop); | ||
171 | |||
172 | int kthread_stop_sem(struct task_struct *k, struct semaphore *s) | ||
173 | { | ||
168 | int ret; | 174 | int ret; |
169 | 175 | ||
170 | down(&kthread_stop_lock); | 176 | down(&kthread_stop_lock); |
@@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k) | |||
178 | 184 | ||
179 | /* Now set kthread_should_stop() to true, and wake it up. */ | 185 | /* Now set kthread_should_stop() to true, and wake it up. */ |
180 | kthread_stop_info.k = k; | 186 | kthread_stop_info.k = k; |
181 | wake_up_process(k); | 187 | if (s) |
188 | up(s); | ||
189 | else | ||
190 | wake_up_process(k); | ||
182 | put_task_struct(k); | 191 | put_task_struct(k); |
183 | 192 | ||
184 | /* Once it dies, reset stop ptr, gather result and we're done. */ | 193 | /* Once it dies, reset stop ptr, gather result and we're done. */ |
@@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k) | |||
189 | 198 | ||
190 | return ret; | 199 | return ret; |
191 | } | 200 | } |
192 | EXPORT_SYMBOL(kthread_stop); | 201 | EXPORT_SYMBOL(kthread_stop_sem); |
193 | 202 | ||
194 | static __init int helper_init(void) | 203 | static __init int helper_init(void) |
195 | { | 204 | { |
diff --git a/kernel/module.c b/kernel/module.c index ff5c500ab625..2ea929d51ad0 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/stop_machine.h> | 37 | #include <linux/stop_machine.h> |
38 | #include <linux/device.h> | 38 | #include <linux/device.h> |
39 | #include <linux/string.h> | 39 | #include <linux/string.h> |
40 | #include <linux/sched.h> | ||
40 | #include <asm/uaccess.h> | 41 | #include <asm/uaccess.h> |
41 | #include <asm/semaphore.h> | 42 | #include <asm/semaphore.h> |
42 | #include <asm/cacheflush.h> | 43 | #include <asm/cacheflush.h> |
diff --git a/kernel/params.c b/kernel/params.c index fbf173215fd2..47ba69547945 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/device.h> | 24 | #include <linux/device.h> |
25 | #include <linux/err.h> | 25 | #include <linux/err.h> |
26 | #include <linux/slab.h> | ||
26 | 27 | ||
27 | #if 0 | 28 | #if 0 |
28 | #define DEBUGP printk | 29 | #define DEBUGP printk |
@@ -80,8 +81,6 @@ static char *next_arg(char *args, char **param, char **val) | |||
80 | int in_quote = 0, quoted = 0; | 81 | int in_quote = 0, quoted = 0; |
81 | char *next; | 82 | char *next; |
82 | 83 | ||
83 | /* Chew any extra spaces */ | ||
84 | while (*args == ' ') args++; | ||
85 | if (*args == '"') { | 84 | if (*args == '"') { |
86 | args++; | 85 | args++; |
87 | in_quote = 1; | 86 | in_quote = 1; |
@@ -121,6 +120,10 @@ static char *next_arg(char *args, char **param, char **val) | |||
121 | next = args + i + 1; | 120 | next = args + i + 1; |
122 | } else | 121 | } else |
123 | next = args + i; | 122 | next = args + i; |
123 | |||
124 | /* Chew up trailing spaces. */ | ||
125 | while (*next == ' ') | ||
126 | next++; | ||
124 | return next; | 127 | return next; |
125 | } | 128 | } |
126 | 129 | ||
@@ -135,6 +138,10 @@ int parse_args(const char *name, | |||
135 | 138 | ||
136 | DEBUGP("Parsing ARGS: %s\n", args); | 139 | DEBUGP("Parsing ARGS: %s\n", args); |
137 | 140 | ||
141 | /* Chew leading spaces */ | ||
142 | while (*args == ' ') | ||
143 | args++; | ||
144 | |||
138 | while (*args) { | 145 | while (*args) { |
139 | int ret; | 146 | int ret; |
140 | 147 | ||
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ad85d3f0dcc4..cae4f5728997 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -36,7 +36,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp) | |||
36 | union cpu_time_count ret; | 36 | union cpu_time_count ret; |
37 | ret.sched = 0; /* high half always zero when .cpu used */ | 37 | ret.sched = 0; /* high half always zero when .cpu used */ |
38 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { | 38 | if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { |
39 | ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; | 39 | ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; |
40 | } else { | 40 | } else { |
41 | ret.cpu = timespec_to_cputime(tp); | 41 | ret.cpu = timespec_to_cputime(tp); |
42 | } | 42 | } |
@@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock, | |||
91 | * Update expiry time from increment, and increase overrun count, | 91 | * Update expiry time from increment, and increase overrun count, |
92 | * given the current clock sample. | 92 | * given the current clock sample. |
93 | */ | 93 | */ |
94 | static inline void bump_cpu_timer(struct k_itimer *timer, | 94 | static void bump_cpu_timer(struct k_itimer *timer, |
95 | union cpu_time_count now) | 95 | union cpu_time_count now) |
96 | { | 96 | { |
97 | int i; | 97 | int i; |
@@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, | |||
110 | for (i = 0; incr < delta - incr; i++) | 110 | for (i = 0; incr < delta - incr; i++) |
111 | incr = incr << 1; | 111 | incr = incr << 1; |
112 | for (; i >= 0; incr >>= 1, i--) { | 112 | for (; i >= 0; incr >>= 1, i--) { |
113 | if (delta <= incr) | 113 | if (delta < incr) |
114 | continue; | 114 | continue; |
115 | timer->it.cpu.expires.sched += incr; | 115 | timer->it.cpu.expires.sched += incr; |
116 | timer->it_overrun += 1 << i; | 116 | timer->it_overrun += 1 << i; |
@@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer, | |||
128 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) | 128 | for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) |
129 | incr = cputime_add(incr, incr); | 129 | incr = cputime_add(incr, incr); |
130 | for (; i >= 0; incr = cputime_halve(incr), i--) { | 130 | for (; i >= 0; incr = cputime_halve(incr), i--) { |
131 | if (cputime_le(delta, incr)) | 131 | if (cputime_lt(delta, incr)) |
132 | continue; | 132 | continue; |
133 | timer->it.cpu.expires.cpu = | 133 | timer->it.cpu.expires.cpu = |
134 | cputime_add(timer->it.cpu.expires.cpu, incr); | 134 | cputime_add(timer->it.cpu.expires.cpu, incr); |
@@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer) | |||
380 | int posix_cpu_timer_del(struct k_itimer *timer) | 380 | int posix_cpu_timer_del(struct k_itimer *timer) |
381 | { | 381 | { |
382 | struct task_struct *p = timer->it.cpu.task; | 382 | struct task_struct *p = timer->it.cpu.task; |
383 | int ret = 0; | ||
383 | 384 | ||
384 | if (timer->it.cpu.firing) | 385 | if (likely(p != NULL)) { |
385 | return TIMER_RETRY; | ||
386 | |||
387 | if (unlikely(p == NULL)) | ||
388 | return 0; | ||
389 | |||
390 | if (!list_empty(&timer->it.cpu.entry)) { | ||
391 | read_lock(&tasklist_lock); | 386 | read_lock(&tasklist_lock); |
392 | if (unlikely(p->signal == NULL)) { | 387 | if (unlikely(p->signal == NULL)) { |
393 | /* | 388 | /* |
@@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer) | |||
396 | */ | 391 | */ |
397 | BUG_ON(!list_empty(&timer->it.cpu.entry)); | 392 | BUG_ON(!list_empty(&timer->it.cpu.entry)); |
398 | } else { | 393 | } else { |
399 | /* | ||
400 | * Take us off the task's timer list. | ||
401 | */ | ||
402 | spin_lock(&p->sighand->siglock); | 394 | spin_lock(&p->sighand->siglock); |
403 | list_del(&timer->it.cpu.entry); | 395 | if (timer->it.cpu.firing) |
396 | ret = TIMER_RETRY; | ||
397 | else | ||
398 | list_del(&timer->it.cpu.entry); | ||
404 | spin_unlock(&p->sighand->siglock); | 399 | spin_unlock(&p->sighand->siglock); |
405 | } | 400 | } |
406 | read_unlock(&tasklist_lock); | 401 | read_unlock(&tasklist_lock); |
402 | |||
403 | if (!ret) | ||
404 | put_task_struct(p); | ||
407 | } | 405 | } |
408 | put_task_struct(p); | ||
409 | 406 | ||
410 | return 0; | 407 | return ret; |
411 | } | 408 | } |
412 | 409 | ||
413 | /* | 410 | /* |
@@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head, | |||
424 | cputime_t ptime = cputime_add(utime, stime); | 421 | cputime_t ptime = cputime_add(utime, stime); |
425 | 422 | ||
426 | list_for_each_entry_safe(timer, next, head, entry) { | 423 | list_for_each_entry_safe(timer, next, head, entry) { |
427 | timer->task = NULL; | ||
428 | list_del_init(&timer->entry); | 424 | list_del_init(&timer->entry); |
429 | if (cputime_lt(timer->expires.cpu, ptime)) { | 425 | if (cputime_lt(timer->expires.cpu, ptime)) { |
430 | timer->expires.cpu = cputime_zero; | 426 | timer->expires.cpu = cputime_zero; |
@@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head, | |||
436 | 432 | ||
437 | ++head; | 433 | ++head; |
438 | list_for_each_entry_safe(timer, next, head, entry) { | 434 | list_for_each_entry_safe(timer, next, head, entry) { |
439 | timer->task = NULL; | ||
440 | list_del_init(&timer->entry); | 435 | list_del_init(&timer->entry); |
441 | if (cputime_lt(timer->expires.cpu, utime)) { | 436 | if (cputime_lt(timer->expires.cpu, utime)) { |
442 | timer->expires.cpu = cputime_zero; | 437 | timer->expires.cpu = cputime_zero; |
@@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head, | |||
448 | 443 | ||
449 | ++head; | 444 | ++head; |
450 | list_for_each_entry_safe(timer, next, head, entry) { | 445 | list_for_each_entry_safe(timer, next, head, entry) { |
451 | timer->task = NULL; | ||
452 | list_del_init(&timer->entry); | 446 | list_del_init(&timer->entry); |
453 | if (timer->expires.sched < sched_time) { | 447 | if (timer->expires.sched < sched_time) { |
454 | timer->expires.sched = 0; | 448 | timer->expires.sched = 0; |
@@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p, | |||
492 | struct task_struct *t = p; | 486 | struct task_struct *t = p; |
493 | unsigned int nthreads = atomic_read(&p->signal->live); | 487 | unsigned int nthreads = atomic_read(&p->signal->live); |
494 | 488 | ||
489 | if (!nthreads) | ||
490 | return; | ||
491 | |||
495 | switch (clock_idx) { | 492 | switch (clock_idx) { |
496 | default: | 493 | default: |
497 | BUG(); | 494 | BUG(); |
@@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
500 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | 497 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), |
501 | nthreads); | 498 | nthreads); |
502 | do { | 499 | do { |
503 | if (!unlikely(t->exit_state)) { | 500 | if (likely(!(t->flags & PF_EXITING))) { |
504 | ticks = cputime_add(prof_ticks(t), left); | 501 | ticks = cputime_add(prof_ticks(t), left); |
505 | if (cputime_eq(t->it_prof_expires, | 502 | if (cputime_eq(t->it_prof_expires, |
506 | cputime_zero) || | 503 | cputime_zero) || |
@@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
515 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), | 512 | left = cputime_div(cputime_sub(expires.cpu, val.cpu), |
516 | nthreads); | 513 | nthreads); |
517 | do { | 514 | do { |
518 | if (!unlikely(t->exit_state)) { | 515 | if (likely(!(t->flags & PF_EXITING))) { |
519 | ticks = cputime_add(virt_ticks(t), left); | 516 | ticks = cputime_add(virt_ticks(t), left); |
520 | if (cputime_eq(t->it_virt_expires, | 517 | if (cputime_eq(t->it_virt_expires, |
521 | cputime_zero) || | 518 | cputime_zero) || |
@@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p, | |||
530 | nsleft = expires.sched - val.sched; | 527 | nsleft = expires.sched - val.sched; |
531 | do_div(nsleft, nthreads); | 528 | do_div(nsleft, nthreads); |
532 | do { | 529 | do { |
533 | if (!unlikely(t->exit_state)) { | 530 | if (likely(!(t->flags & PF_EXITING))) { |
534 | ns = t->sched_time + nsleft; | 531 | ns = t->sched_time + nsleft; |
535 | if (t->it_sched_expires == 0 || | 532 | if (t->it_sched_expires == 0 || |
536 | t->it_sched_expires > ns) { | 533 | t->it_sched_expires > ns) { |
@@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | |||
569 | struct cpu_timer_list *next; | 566 | struct cpu_timer_list *next; |
570 | unsigned long i; | 567 | unsigned long i; |
571 | 568 | ||
569 | if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) | ||
570 | return; | ||
571 | |||
572 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? | 572 | head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? |
573 | p->cpu_timers : p->signal->cpu_timers); | 573 | p->cpu_timers : p->signal->cpu_timers); |
574 | head += CPUCLOCK_WHICH(timer->it_clock); | 574 | head += CPUCLOCK_WHICH(timer->it_clock); |
@@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) | |||
579 | listpos = head; | 579 | listpos = head; |
580 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { | 580 | if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { |
581 | list_for_each_entry(next, head, entry) { | 581 | list_for_each_entry(next, head, entry) { |
582 | if (next->expires.sched > nt->expires.sched) { | 582 | if (next->expires.sched > nt->expires.sched) |
583 | listpos = &next->entry; | ||
584 | break; | 583 | break; |
585 | } | 584 | listpos = &next->entry; |
586 | } | 585 | } |
587 | } else { | 586 | } else { |
588 | list_for_each_entry(next, head, entry) { | 587 | list_for_each_entry(next, head, entry) { |
589 | if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { | 588 | if (cputime_gt(next->expires.cpu, nt->expires.cpu)) |
590 | listpos = &next->entry; | ||
591 | break; | 589 | break; |
592 | } | 590 | listpos = &next->entry; |
593 | } | 591 | } |
594 | } | 592 | } |
595 | list_add(&nt->entry, listpos); | 593 | list_add(&nt->entry, listpos); |
@@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
733 | * Disarm any old timer after extracting its expiry time. | 731 | * Disarm any old timer after extracting its expiry time. |
734 | */ | 732 | */ |
735 | BUG_ON(!irqs_disabled()); | 733 | BUG_ON(!irqs_disabled()); |
734 | |||
735 | ret = 0; | ||
736 | spin_lock(&p->sighand->siglock); | 736 | spin_lock(&p->sighand->siglock); |
737 | old_expires = timer->it.cpu.expires; | 737 | old_expires = timer->it.cpu.expires; |
738 | list_del_init(&timer->it.cpu.entry); | 738 | if (unlikely(timer->it.cpu.firing)) { |
739 | timer->it.cpu.firing = -1; | ||
740 | ret = TIMER_RETRY; | ||
741 | } else | ||
742 | list_del_init(&timer->it.cpu.entry); | ||
739 | spin_unlock(&p->sighand->siglock); | 743 | spin_unlock(&p->sighand->siglock); |
740 | 744 | ||
741 | /* | 745 | /* |
@@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
783 | } | 787 | } |
784 | } | 788 | } |
785 | 789 | ||
786 | if (unlikely(timer->it.cpu.firing)) { | 790 | if (unlikely(ret)) { |
787 | /* | 791 | /* |
788 | * We are colliding with the timer actually firing. | 792 | * We are colliding with the timer actually firing. |
789 | * Punt after filling in the timer's old value, and | 793 | * Punt after filling in the timer's old value, and |
@@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, | |||
791 | * it as an overrun (thanks to bump_cpu_timer above). | 795 | * it as an overrun (thanks to bump_cpu_timer above). |
792 | */ | 796 | */ |
793 | read_unlock(&tasklist_lock); | 797 | read_unlock(&tasklist_lock); |
794 | timer->it.cpu.firing = -1; | ||
795 | ret = TIMER_RETRY; | ||
796 | goto out; | 798 | goto out; |
797 | } | 799 | } |
798 | 800 | ||
@@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) | |||
958 | static void check_thread_timers(struct task_struct *tsk, | 960 | static void check_thread_timers(struct task_struct *tsk, |
959 | struct list_head *firing) | 961 | struct list_head *firing) |
960 | { | 962 | { |
963 | int maxfire; | ||
961 | struct list_head *timers = tsk->cpu_timers; | 964 | struct list_head *timers = tsk->cpu_timers; |
962 | 965 | ||
966 | maxfire = 20; | ||
963 | tsk->it_prof_expires = cputime_zero; | 967 | tsk->it_prof_expires = cputime_zero; |
964 | while (!list_empty(timers)) { | 968 | while (!list_empty(timers)) { |
965 | struct cpu_timer_list *t = list_entry(timers->next, | 969 | struct cpu_timer_list *t = list_entry(timers->next, |
966 | struct cpu_timer_list, | 970 | struct cpu_timer_list, |
967 | entry); | 971 | entry); |
968 | if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { | 972 | if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { |
969 | tsk->it_prof_expires = t->expires.cpu; | 973 | tsk->it_prof_expires = t->expires.cpu; |
970 | break; | 974 | break; |
971 | } | 975 | } |
@@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
974 | } | 978 | } |
975 | 979 | ||
976 | ++timers; | 980 | ++timers; |
981 | maxfire = 20; | ||
977 | tsk->it_virt_expires = cputime_zero; | 982 | tsk->it_virt_expires = cputime_zero; |
978 | while (!list_empty(timers)) { | 983 | while (!list_empty(timers)) { |
979 | struct cpu_timer_list *t = list_entry(timers->next, | 984 | struct cpu_timer_list *t = list_entry(timers->next, |
980 | struct cpu_timer_list, | 985 | struct cpu_timer_list, |
981 | entry); | 986 | entry); |
982 | if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { | 987 | if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { |
983 | tsk->it_virt_expires = t->expires.cpu; | 988 | tsk->it_virt_expires = t->expires.cpu; |
984 | break; | 989 | break; |
985 | } | 990 | } |
@@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk, | |||
988 | } | 993 | } |
989 | 994 | ||
990 | ++timers; | 995 | ++timers; |
996 | maxfire = 20; | ||
991 | tsk->it_sched_expires = 0; | 997 | tsk->it_sched_expires = 0; |
992 | while (!list_empty(timers)) { | 998 | while (!list_empty(timers)) { |
993 | struct cpu_timer_list *t = list_entry(timers->next, | 999 | struct cpu_timer_list *t = list_entry(timers->next, |
994 | struct cpu_timer_list, | 1000 | struct cpu_timer_list, |
995 | entry); | 1001 | entry); |
996 | if (tsk->sched_time < t->expires.sched) { | 1002 | if (!--maxfire || tsk->sched_time < t->expires.sched) { |
997 | tsk->it_sched_expires = t->expires.sched; | 1003 | tsk->it_sched_expires = t->expires.sched; |
998 | break; | 1004 | break; |
999 | } | 1005 | } |
@@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1010 | static void check_process_timers(struct task_struct *tsk, | 1016 | static void check_process_timers(struct task_struct *tsk, |
1011 | struct list_head *firing) | 1017 | struct list_head *firing) |
1012 | { | 1018 | { |
1019 | int maxfire; | ||
1013 | struct signal_struct *const sig = tsk->signal; | 1020 | struct signal_struct *const sig = tsk->signal; |
1014 | cputime_t utime, stime, ptime, virt_expires, prof_expires; | 1021 | cputime_t utime, stime, ptime, virt_expires, prof_expires; |
1015 | unsigned long long sched_time, sched_expires; | 1022 | unsigned long long sched_time, sched_expires; |
@@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1042 | } while (t != tsk); | 1049 | } while (t != tsk); |
1043 | ptime = cputime_add(utime, stime); | 1050 | ptime = cputime_add(utime, stime); |
1044 | 1051 | ||
1052 | maxfire = 20; | ||
1045 | prof_expires = cputime_zero; | 1053 | prof_expires = cputime_zero; |
1046 | while (!list_empty(timers)) { | 1054 | while (!list_empty(timers)) { |
1047 | struct cpu_timer_list *t = list_entry(timers->next, | 1055 | struct cpu_timer_list *t = list_entry(timers->next, |
1048 | struct cpu_timer_list, | 1056 | struct cpu_timer_list, |
1049 | entry); | 1057 | entry); |
1050 | if (cputime_lt(ptime, t->expires.cpu)) { | 1058 | if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) { |
1051 | prof_expires = t->expires.cpu; | 1059 | prof_expires = t->expires.cpu; |
1052 | break; | 1060 | break; |
1053 | } | 1061 | } |
@@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1056 | } | 1064 | } |
1057 | 1065 | ||
1058 | ++timers; | 1066 | ++timers; |
1067 | maxfire = 20; | ||
1059 | virt_expires = cputime_zero; | 1068 | virt_expires = cputime_zero; |
1060 | while (!list_empty(timers)) { | 1069 | while (!list_empty(timers)) { |
1061 | struct cpu_timer_list *t = list_entry(timers->next, | 1070 | struct cpu_timer_list *t = list_entry(timers->next, |
1062 | struct cpu_timer_list, | 1071 | struct cpu_timer_list, |
1063 | entry); | 1072 | entry); |
1064 | if (cputime_lt(utime, t->expires.cpu)) { | 1073 | if (!--maxfire || cputime_lt(utime, t->expires.cpu)) { |
1065 | virt_expires = t->expires.cpu; | 1074 | virt_expires = t->expires.cpu; |
1066 | break; | 1075 | break; |
1067 | } | 1076 | } |
@@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk, | |||
1070 | } | 1079 | } |
1071 | 1080 | ||
1072 | ++timers; | 1081 | ++timers; |
1082 | maxfire = 20; | ||
1073 | sched_expires = 0; | 1083 | sched_expires = 0; |
1074 | while (!list_empty(timers)) { | 1084 | while (!list_empty(timers)) { |
1075 | struct cpu_timer_list *t = list_entry(timers->next, | 1085 | struct cpu_timer_list *t = list_entry(timers->next, |
1076 | struct cpu_timer_list, | 1086 | struct cpu_timer_list, |
1077 | entry); | 1087 | entry); |
1078 | if (sched_time < t->expires.sched) { | 1088 | if (!--maxfire || sched_time < t->expires.sched) { |
1079 | sched_expires = t->expires.sched; | 1089 | sched_expires = t->expires.sched; |
1080 | break; | 1090 | break; |
1081 | } | 1091 | } |
@@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk, | |||
1158 | unsigned long long sched_left, sched; | 1168 | unsigned long long sched_left, sched; |
1159 | const unsigned int nthreads = atomic_read(&sig->live); | 1169 | const unsigned int nthreads = atomic_read(&sig->live); |
1160 | 1170 | ||
1171 | if (!nthreads) | ||
1172 | return; | ||
1173 | |||
1161 | prof_left = cputime_sub(prof_expires, utime); | 1174 | prof_left = cputime_sub(prof_expires, utime); |
1162 | prof_left = cputime_sub(prof_left, stime); | 1175 | prof_left = cputime_sub(prof_left, stime); |
1163 | prof_left = cputime_div(prof_left, nthreads); | 1176 | prof_left = cputime_div(prof_left, nthreads); |
@@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk, | |||
1194 | 1207 | ||
1195 | do { | 1208 | do { |
1196 | t = next_thread(t); | 1209 | t = next_thread(t); |
1197 | } while (unlikely(t->exit_state)); | 1210 | } while (unlikely(t->flags & PF_EXITING)); |
1198 | } while (t != tsk); | 1211 | } while (t != tsk); |
1199 | } | 1212 | } |
1200 | } | 1213 | } |
@@ -1212,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1212 | /* | 1225 | /* |
1213 | * The task was cleaned up already, no future firings. | 1226 | * The task was cleaned up already, no future firings. |
1214 | */ | 1227 | */ |
1215 | return; | 1228 | goto out; |
1216 | 1229 | ||
1217 | /* | 1230 | /* |
1218 | * Fetch the current sample and update the timer's expiry time. | 1231 | * Fetch the current sample and update the timer's expiry time. |
@@ -1222,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1222 | bump_cpu_timer(timer, now); | 1235 | bump_cpu_timer(timer, now); |
1223 | if (unlikely(p->exit_state)) { | 1236 | if (unlikely(p->exit_state)) { |
1224 | clear_dead_task(timer, now); | 1237 | clear_dead_task(timer, now); |
1225 | return; | 1238 | goto out; |
1226 | } | 1239 | } |
1227 | read_lock(&tasklist_lock); /* arm_timer needs it. */ | 1240 | read_lock(&tasklist_lock); /* arm_timer needs it. */ |
1228 | } else { | 1241 | } else { |
@@ -1235,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1235 | put_task_struct(p); | 1248 | put_task_struct(p); |
1236 | timer->it.cpu.task = p = NULL; | 1249 | timer->it.cpu.task = p = NULL; |
1237 | timer->it.cpu.expires.sched = 0; | 1250 | timer->it.cpu.expires.sched = 0; |
1238 | read_unlock(&tasklist_lock); | 1251 | goto out_unlock; |
1239 | return; | ||
1240 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { | 1252 | } else if (unlikely(p->exit_state) && thread_group_empty(p)) { |
1241 | /* | 1253 | /* |
1242 | * We've noticed that the thread is dead, but | 1254 | * We've noticed that the thread is dead, but |
@@ -1244,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1244 | * drop our task ref. | 1256 | * drop our task ref. |
1245 | */ | 1257 | */ |
1246 | clear_dead_task(timer, now); | 1258 | clear_dead_task(timer, now); |
1247 | read_unlock(&tasklist_lock); | 1259 | goto out_unlock; |
1248 | return; | ||
1249 | } | 1260 | } |
1250 | cpu_clock_sample_group(timer->it_clock, p, &now); | 1261 | cpu_clock_sample_group(timer->it_clock, p, &now); |
1251 | bump_cpu_timer(timer, now); | 1262 | bump_cpu_timer(timer, now); |
@@ -1257,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) | |||
1257 | */ | 1268 | */ |
1258 | arm_timer(timer, now); | 1269 | arm_timer(timer, now); |
1259 | 1270 | ||
1271 | out_unlock: | ||
1260 | read_unlock(&tasklist_lock); | 1272 | read_unlock(&tasklist_lock); |
1273 | |||
1274 | out: | ||
1275 | timer->it_overrun_last = timer->it_overrun; | ||
1276 | timer->it_overrun = -1; | ||
1277 | ++timer->it_requeue_pending; | ||
1261 | } | 1278 | } |
1262 | 1279 | ||
1263 | /* | 1280 | /* |
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index b7b532acd9fc..5870efb3e200 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -270,7 +270,7 @@ static void tstojiffie(struct timespec *tp, int res, u64 *jiff) | |||
270 | long sec = tp->tv_sec; | 270 | long sec = tp->tv_sec; |
271 | long nsec = tp->tv_nsec + res - 1; | 271 | long nsec = tp->tv_nsec + res - 1; |
272 | 272 | ||
273 | if (nsec > NSEC_PER_SEC) { | 273 | if (nsec >= NSEC_PER_SEC) { |
274 | sec++; | 274 | sec++; |
275 | nsec -= NSEC_PER_SEC; | 275 | nsec -= NSEC_PER_SEC; |
276 | } | 276 | } |
@@ -1157,7 +1157,7 @@ retry_delete: | |||
1157 | } | 1157 | } |
1158 | 1158 | ||
1159 | /* | 1159 | /* |
1160 | * This is called by __exit_signal, only when there are no more | 1160 | * This is called by do_exit or de_thread, only when there are no more |
1161 | * references to the shared signal_struct. | 1161 | * references to the shared signal_struct. |
1162 | */ | 1162 | */ |
1163 | void exit_itimers(struct signal_struct *sig) | 1163 | void exit_itimers(struct signal_struct *sig) |
@@ -1209,13 +1209,9 @@ static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp) | |||
1209 | 1209 | ||
1210 | do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); | 1210 | do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); |
1211 | 1211 | ||
1212 | tp->tv_sec += wall_to_mono.tv_sec; | 1212 | set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec, |
1213 | tp->tv_nsec += wall_to_mono.tv_nsec; | 1213 | tp->tv_nsec + wall_to_mono.tv_nsec); |
1214 | 1214 | ||
1215 | if ((tp->tv_nsec - NSEC_PER_SEC) > 0) { | ||
1216 | tp->tv_nsec -= NSEC_PER_SEC; | ||
1217 | tp->tv_sec++; | ||
1218 | } | ||
1219 | return 0; | 1215 | return 0; |
1220 | } | 1216 | } |
1221 | 1217 | ||
@@ -1295,13 +1291,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp) | |||
1295 | return error; | 1291 | return error; |
1296 | } | 1292 | } |
1297 | 1293 | ||
1298 | static void nanosleep_wake_up(unsigned long __data) | ||
1299 | { | ||
1300 | struct task_struct *p = (struct task_struct *) __data; | ||
1301 | |||
1302 | wake_up_process(p); | ||
1303 | } | ||
1304 | |||
1305 | /* | 1294 | /* |
1306 | * The standard says that an absolute nanosleep call MUST wake up at | 1295 | * The standard says that an absolute nanosleep call MUST wake up at |
1307 | * the requested time in spite of clock settings. Here is what we do: | 1296 | * the requested time in spite of clock settings. Here is what we do: |
@@ -1442,7 +1431,6 @@ static int common_nsleep(clockid_t which_clock, | |||
1442 | int flags, struct timespec *tsave) | 1431 | int flags, struct timespec *tsave) |
1443 | { | 1432 | { |
1444 | struct timespec t, dum; | 1433 | struct timespec t, dum; |
1445 | struct timer_list new_timer; | ||
1446 | DECLARE_WAITQUEUE(abs_wqueue, current); | 1434 | DECLARE_WAITQUEUE(abs_wqueue, current); |
1447 | u64 rq_time = (u64)0; | 1435 | u64 rq_time = (u64)0; |
1448 | s64 left; | 1436 | s64 left; |
@@ -1451,10 +1439,6 @@ static int common_nsleep(clockid_t which_clock, | |||
1451 | &current_thread_info()->restart_block; | 1439 | &current_thread_info()->restart_block; |
1452 | 1440 | ||
1453 | abs_wqueue.flags = 0; | 1441 | abs_wqueue.flags = 0; |
1454 | init_timer(&new_timer); | ||
1455 | new_timer.expires = 0; | ||
1456 | new_timer.data = (unsigned long) current; | ||
1457 | new_timer.function = nanosleep_wake_up; | ||
1458 | abs = flags & TIMER_ABSTIME; | 1442 | abs = flags & TIMER_ABSTIME; |
1459 | 1443 | ||
1460 | if (restart_block->fn == clock_nanosleep_restart) { | 1444 | if (restart_block->fn == clock_nanosleep_restart) { |
@@ -1490,13 +1474,8 @@ static int common_nsleep(clockid_t which_clock, | |||
1490 | if (left < (s64)0) | 1474 | if (left < (s64)0) |
1491 | break; | 1475 | break; |
1492 | 1476 | ||
1493 | new_timer.expires = jiffies + left; | 1477 | schedule_timeout_interruptible(left); |
1494 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1495 | add_timer(&new_timer); | ||
1496 | |||
1497 | schedule(); | ||
1498 | 1478 | ||
1499 | del_timer_sync(&new_timer); | ||
1500 | left = rq_time - get_jiffies_64(); | 1479 | left = rq_time - get_jiffies_64(); |
1501 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); | 1480 | } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); |
1502 | 1481 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 396c7873e804..5ec248cb7f4a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -19,6 +19,15 @@ config PM | |||
19 | will issue the hlt instruction if nothing is to be done, thereby | 19 | will issue the hlt instruction if nothing is to be done, thereby |
20 | sending the processor to sleep and saving power. | 20 | sending the processor to sleep and saving power. |
21 | 21 | ||
22 | config PM_LEGACY | ||
23 | bool "Legacy Power Management API" | ||
24 | depends on PM | ||
25 | default y | ||
26 | ---help--- | ||
27 | Support for pm_register() and friends. | ||
28 | |||
29 | If unsure, say Y. | ||
30 | |||
22 | config PM_DEBUG | 31 | config PM_DEBUG |
23 | bool "Power Management Debug Support" | 32 | bool "Power Management Debug Support" |
24 | depends on PM | 33 | depends on PM |
@@ -29,7 +38,7 @@ config PM_DEBUG | |||
29 | 38 | ||
30 | config SOFTWARE_SUSPEND | 39 | config SOFTWARE_SUSPEND |
31 | bool "Software Suspend" | 40 | bool "Software Suspend" |
32 | depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP)) | 41 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP) |
33 | ---help--- | 42 | ---help--- |
34 | Enable the possibility of suspending the machine. | 43 | Enable the possibility of suspending the machine. |
35 | It doesn't need APM. | 44 | It doesn't need APM. |
diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 2f438d0eaa13..04be7d0d96a7 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile | |||
@@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y) | |||
3 | EXTRA_CFLAGS += -DDEBUG | 3 | EXTRA_CFLAGS += -DDEBUG |
4 | endif | 4 | endif |
5 | 5 | ||
6 | obj-y := main.o process.o console.o pm.o | 6 | obj-y := main.o process.o console.o |
7 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o | 7 | obj-$(CONFIG_PM_LEGACY) += pm.o |
8 | obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o | ||
8 | 9 | ||
9 | obj-$(CONFIG_SUSPEND_SMP) += smp.o | 10 | obj-$(CONFIG_SUSPEND_SMP) += smp.o |
10 | 11 | ||
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 2d8bf054d036..027322a564f4 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -17,12 +17,12 @@ | |||
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
19 | #include <linux/mount.h> | 19 | #include <linux/mount.h> |
20 | #include <linux/pm.h> | ||
20 | 21 | ||
21 | #include "power.h" | 22 | #include "power.h" |
22 | 23 | ||
23 | 24 | ||
24 | extern suspend_disk_method_t pm_disk_mode; | 25 | extern suspend_disk_method_t pm_disk_mode; |
25 | extern struct pm_ops * pm_ops; | ||
26 | 26 | ||
27 | extern int swsusp_suspend(void); | 27 | extern int swsusp_suspend(void); |
28 | extern int swsusp_write(void); | 28 | extern int swsusp_write(void); |
@@ -30,7 +30,6 @@ extern int swsusp_check(void); | |||
30 | extern int swsusp_read(void); | 30 | extern int swsusp_read(void); |
31 | extern void swsusp_close(void); | 31 | extern void swsusp_close(void); |
32 | extern int swsusp_resume(void); | 32 | extern int swsusp_resume(void); |
33 | extern int swsusp_free(void); | ||
34 | 33 | ||
35 | 34 | ||
36 | static int noresume = 0; | 35 | static int noresume = 0; |
@@ -49,13 +48,11 @@ dev_t swsusp_resume_device; | |||
49 | 48 | ||
50 | static void power_down(suspend_disk_method_t mode) | 49 | static void power_down(suspend_disk_method_t mode) |
51 | { | 50 | { |
52 | unsigned long flags; | ||
53 | int error = 0; | 51 | int error = 0; |
54 | 52 | ||
55 | local_irq_save(flags); | ||
56 | switch(mode) { | 53 | switch(mode) { |
57 | case PM_DISK_PLATFORM: | 54 | case PM_DISK_PLATFORM: |
58 | device_shutdown(); | 55 | kernel_power_off_prepare(); |
59 | error = pm_ops->enter(PM_SUSPEND_DISK); | 56 | error = pm_ops->enter(PM_SUSPEND_DISK); |
60 | break; | 57 | break; |
61 | case PM_DISK_SHUTDOWN: | 58 | case PM_DISK_SHUTDOWN: |
@@ -95,10 +92,7 @@ static void free_some_memory(void) | |||
95 | printk("Freeing memory... "); | 92 | printk("Freeing memory... "); |
96 | while ((tmp = shrink_all_memory(10000))) { | 93 | while ((tmp = shrink_all_memory(10000))) { |
97 | pages += tmp; | 94 | pages += tmp; |
98 | printk("\b%c", p[i]); | 95 | printk("\b%c", p[i++ % 4]); |
99 | i++; | ||
100 | if (i > 3) | ||
101 | i = 0; | ||
102 | } | 96 | } |
103 | printk("\bdone (%li pages freed)\n", pages); | 97 | printk("\bdone (%li pages freed)\n", pages); |
104 | } | 98 | } |
@@ -180,13 +174,12 @@ int pm_suspend_disk(void) | |||
180 | goto Done; | 174 | goto Done; |
181 | 175 | ||
182 | if (in_suspend) { | 176 | if (in_suspend) { |
177 | device_resume(); | ||
183 | pr_debug("PM: writing image.\n"); | 178 | pr_debug("PM: writing image.\n"); |
184 | error = swsusp_write(); | 179 | error = swsusp_write(); |
185 | if (!error) | 180 | if (!error) |
186 | power_down(pm_disk_mode); | 181 | power_down(pm_disk_mode); |
187 | else { | 182 | else { |
188 | /* swsusp_write can not fail in device_resume, | ||
189 | no need to do second device_resume */ | ||
190 | swsusp_free(); | 183 | swsusp_free(); |
191 | unprepare_processes(); | 184 | unprepare_processes(); |
192 | return error; | 185 | return error; |
@@ -254,14 +247,17 @@ static int software_resume(void) | |||
254 | 247 | ||
255 | pr_debug("PM: Reading swsusp image.\n"); | 248 | pr_debug("PM: Reading swsusp image.\n"); |
256 | 249 | ||
257 | if ((error = swsusp_read())) | 250 | if ((error = swsusp_read())) { |
258 | goto Cleanup; | 251 | swsusp_free(); |
252 | goto Thaw; | ||
253 | } | ||
259 | 254 | ||
260 | pr_debug("PM: Preparing devices for restore.\n"); | 255 | pr_debug("PM: Preparing devices for restore.\n"); |
261 | 256 | ||
262 | if ((error = device_suspend(PMSG_FREEZE))) { | 257 | if ((error = device_suspend(PMSG_FREEZE))) { |
263 | printk("Some devices failed to suspend\n"); | 258 | printk("Some devices failed to suspend\n"); |
264 | goto Free; | 259 | swsusp_free(); |
260 | goto Thaw; | ||
265 | } | 261 | } |
266 | 262 | ||
267 | mb(); | 263 | mb(); |
@@ -270,9 +266,7 @@ static int software_resume(void) | |||
270 | swsusp_resume(); | 266 | swsusp_resume(); |
271 | pr_debug("PM: Restore failed, recovering.n"); | 267 | pr_debug("PM: Restore failed, recovering.n"); |
272 | device_resume(); | 268 | device_resume(); |
273 | Free: | 269 | Thaw: |
274 | swsusp_free(); | ||
275 | Cleanup: | ||
276 | unprepare_processes(); | 270 | unprepare_processes(); |
277 | Done: | 271 | Done: |
278 | /* For success case, the suspend path will release the lock */ | 272 | /* For success case, the suspend path will release the lock */ |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 22bdc93cc038..d253f3ae2fa5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -24,7 +24,7 @@ | |||
24 | 24 | ||
25 | DECLARE_MUTEX(pm_sem); | 25 | DECLARE_MUTEX(pm_sem); |
26 | 26 | ||
27 | struct pm_ops * pm_ops = NULL; | 27 | struct pm_ops *pm_ops; |
28 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; | 28 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; |
29 | 29 | ||
30 | /** | 30 | /** |
@@ -151,6 +151,18 @@ static char *pm_states[PM_SUSPEND_MAX] = { | |||
151 | #endif | 151 | #endif |
152 | }; | 152 | }; |
153 | 153 | ||
154 | static inline int valid_state(suspend_state_t state) | ||
155 | { | ||
156 | /* Suspend-to-disk does not really need low-level support. | ||
157 | * It can work with reboot if needed. */ | ||
158 | if (state == PM_SUSPEND_DISK) | ||
159 | return 1; | ||
160 | |||
161 | if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) | ||
162 | return 0; | ||
163 | return 1; | ||
164 | } | ||
165 | |||
154 | 166 | ||
155 | /** | 167 | /** |
156 | * enter_state - Do common work of entering low-power state. | 168 | * enter_state - Do common work of entering low-power state. |
@@ -167,6 +179,8 @@ static int enter_state(suspend_state_t state) | |||
167 | { | 179 | { |
168 | int error; | 180 | int error; |
169 | 181 | ||
182 | if (!valid_state(state)) | ||
183 | return -ENODEV; | ||
170 | if (down_trylock(&pm_sem)) | 184 | if (down_trylock(&pm_sem)) |
171 | return -EBUSY; | 185 | return -EBUSY; |
172 | 186 | ||
@@ -236,8 +250,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) | |||
236 | char * s = buf; | 250 | char * s = buf; |
237 | 251 | ||
238 | for (i = 0; i < PM_SUSPEND_MAX; i++) { | 252 | for (i = 0; i < PM_SUSPEND_MAX; i++) { |
239 | if (pm_states[i]) | 253 | if (pm_states[i] && valid_state(i)) |
240 | s += sprintf(s,"%s ",pm_states[i]); | 254 | s += sprintf(s,"%s ", pm_states[i]); |
241 | } | 255 | } |
242 | s += sprintf(s,"\n"); | 256 | s += sprintf(s,"\n"); |
243 | return (s - buf); | 257 | return (s - buf); |
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 159149321b3c..33c508e857dd 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/pm.h> | 25 | #include <linux/pm.h> |
26 | #include <linux/pm_legacy.h> | ||
26 | #include <linux/interrupt.h> | 27 | #include <linux/interrupt.h> |
27 | 28 | ||
28 | int pm_active; | 29 | int pm_active; |
diff --git a/kernel/power/power.h b/kernel/power/power.h index cd6a3493cc0d..6c042b5ee14b 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -1,7 +1,7 @@ | |||
1 | #include <linux/suspend.h> | 1 | #include <linux/suspend.h> |
2 | #include <linux/utsname.h> | 2 | #include <linux/utsname.h> |
3 | 3 | ||
4 | /* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but | 4 | /* With SUSPEND_CONSOLE defined suspend looks *really* cool, but |
5 | we probably do not take enough locks for switching consoles, etc, | 5 | we probably do not take enough locks for switching consoles, etc, |
6 | so bad things might happen. | 6 | so bad things might happen. |
7 | */ | 7 | */ |
@@ -9,6 +9,9 @@ | |||
9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) | 9 | #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) |
10 | #endif | 10 | #endif |
11 | 11 | ||
12 | #define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ | ||
13 | - 4 - 3*sizeof(unsigned long) - sizeof(int) \ | ||
14 | - sizeof(void *)) / sizeof(swp_entry_t)) | ||
12 | 15 | ||
13 | struct swsusp_info { | 16 | struct swsusp_info { |
14 | struct new_utsname uts; | 17 | struct new_utsname uts; |
@@ -18,7 +21,7 @@ struct swsusp_info { | |||
18 | unsigned long image_pages; | 21 | unsigned long image_pages; |
19 | unsigned long pagedir_pages; | 22 | unsigned long pagedir_pages; |
20 | suspend_pagedir_t * suspend_pagedir; | 23 | suspend_pagedir_t * suspend_pagedir; |
21 | swp_entry_t pagedir[768]; | 24 | swp_entry_t pagedir[MAX_PBES]; |
22 | } __attribute__((aligned(PAGE_SIZE))); | 25 | } __attribute__((aligned(PAGE_SIZE))); |
23 | 26 | ||
24 | 27 | ||
@@ -50,3 +53,20 @@ extern void thaw_processes(void); | |||
50 | 53 | ||
51 | extern int pm_prepare_console(void); | 54 | extern int pm_prepare_console(void); |
52 | extern void pm_restore_console(void); | 55 | extern void pm_restore_console(void); |
56 | |||
57 | |||
58 | /* References to section boundaries */ | ||
59 | extern const void __nosave_begin, __nosave_end; | ||
60 | |||
61 | extern unsigned int nr_copy_pages; | ||
62 | extern suspend_pagedir_t *pagedir_nosave; | ||
63 | extern suspend_pagedir_t *pagedir_save; | ||
64 | |||
65 | extern asmlinkage int swsusp_arch_suspend(void); | ||
66 | extern asmlinkage int swsusp_arch_resume(void); | ||
67 | |||
68 | extern void free_pagedir(struct pbe *pblist); | ||
69 | extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); | ||
70 | extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); | ||
71 | extern void swsusp_free(void); | ||
72 | extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); | ||
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 000000000000..4a6dbcefd378 --- /dev/null +++ b/kernel/power/snapshot.c | |||
@@ -0,0 +1,453 @@ | |||
1 | /* | ||
2 | * linux/kernel/power/snapshot.c | ||
3 | * | ||
4 | * This file provides system snapshot/restore functionality. | |
5 | * | ||
6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> | ||
7 | * | ||
8 | * This file is released under the GPLv2, and is based on swsusp.c. | ||
9 | * | ||
10 | */ | ||
11 | |||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/mm.h> | ||
15 | #include <linux/suspend.h> | ||
16 | #include <linux/smp_lock.h> | ||
17 | #include <linux/delay.h> | ||
18 | #include <linux/bitops.h> | ||
19 | #include <linux/spinlock.h> | ||
20 | #include <linux/kernel.h> | ||
21 | #include <linux/pm.h> | ||
22 | #include <linux/device.h> | ||
23 | #include <linux/bootmem.h> | ||
24 | #include <linux/syscalls.h> | ||
25 | #include <linux/console.h> | ||
26 | #include <linux/highmem.h> | ||
27 | |||
28 | #include <asm/uaccess.h> | ||
29 | #include <asm/mmu_context.h> | ||
30 | #include <asm/pgtable.h> | ||
31 | #include <asm/tlbflush.h> | ||
32 | #include <asm/io.h> | ||
33 | |||
34 | #include "power.h" | ||
35 | |||
36 | #ifdef CONFIG_HIGHMEM | ||
37 | struct highmem_page { | ||
38 | char *data; | ||
39 | struct page *page; | ||
40 | struct highmem_page *next; | ||
41 | }; | ||
42 | |||
43 | static struct highmem_page *highmem_copy; | ||
44 | |||
45 | static int save_highmem_zone(struct zone *zone) | ||
46 | { | ||
47 | unsigned long zone_pfn; | ||
48 | mark_free_pages(zone); | ||
49 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
50 | struct page *page; | ||
51 | struct highmem_page *save; | ||
52 | void *kaddr; | ||
53 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
54 | |||
55 | if (!(pfn%1000)) | ||
56 | printk("."); | ||
57 | if (!pfn_valid(pfn)) | ||
58 | continue; | ||
59 | page = pfn_to_page(pfn); | ||
60 | /* | ||
61 | * This condition results from rvmalloc() sans vmalloc_32() | ||
62 | * and architectural memory reservations. This should be | ||
63 | * corrected eventually when the cases giving rise to this | ||
64 | * are better understood. | ||
65 | */ | ||
66 | if (PageReserved(page)) { | ||
67 | printk("highmem reserved page?!\n"); | ||
68 | continue; | ||
69 | } | ||
70 | BUG_ON(PageNosave(page)); | ||
71 | if (PageNosaveFree(page)) | ||
72 | continue; | ||
73 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
74 | if (!save) | ||
75 | return -ENOMEM; | ||
76 | save->next = highmem_copy; | ||
77 | save->page = page; | ||
78 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
79 | if (!save->data) { | ||
80 | kfree(save); | ||
81 | return -ENOMEM; | ||
82 | } | ||
83 | kaddr = kmap_atomic(page, KM_USER0); | ||
84 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
85 | kunmap_atomic(kaddr, KM_USER0); | ||
86 | highmem_copy = save; | ||
87 | } | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | int save_highmem(void) | ||
92 | { | ||
93 | struct zone *zone; | ||
94 | int res = 0; | ||
95 | |||
96 | pr_debug("swsusp: Saving Highmem\n"); | ||
97 | for_each_zone (zone) { | ||
98 | if (is_highmem(zone)) | ||
99 | res = save_highmem_zone(zone); | ||
100 | if (res) | ||
101 | return res; | ||
102 | } | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | int restore_highmem(void) | ||
107 | { | ||
108 | printk("swsusp: Restoring Highmem\n"); | ||
109 | while (highmem_copy) { | ||
110 | struct highmem_page *save = highmem_copy; | ||
111 | void *kaddr; | ||
112 | highmem_copy = save->next; | ||
113 | |||
114 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
115 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
116 | kunmap_atomic(kaddr, KM_USER0); | ||
117 | free_page((long) save->data); | ||
118 | kfree(save); | ||
119 | } | ||
120 | return 0; | ||
121 | } | ||
122 | #endif | ||
123 | |||
124 | static int pfn_is_nosave(unsigned long pfn) | ||
125 | { | ||
126 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
127 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
128 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
129 | } | ||
130 | |||
131 | /** | ||
132 | * saveable - Determine whether a page should be cloned or not. | ||
133 | * @pfn: The page | ||
134 | * | ||
135 | * We save a page if it's Reserved, and not in the range of pages | ||
136 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
137 | * isn't part of a free chunk of pages. | ||
138 | */ | ||
139 | |||
140 | static int saveable(struct zone *zone, unsigned long *zone_pfn) | ||
141 | { | ||
142 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
143 | struct page *page; | ||
144 | |||
145 | if (!pfn_valid(pfn)) | ||
146 | return 0; | ||
147 | |||
148 | page = pfn_to_page(pfn); | ||
149 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
150 | if (PageNosave(page)) | ||
151 | return 0; | ||
152 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | ||
153 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
154 | return 0; | ||
155 | } | ||
156 | if (PageNosaveFree(page)) | ||
157 | return 0; | ||
158 | |||
159 | return 1; | ||
160 | } | ||
161 | |||
162 | static unsigned count_data_pages(void) | ||
163 | { | ||
164 | struct zone *zone; | ||
165 | unsigned long zone_pfn; | ||
166 | unsigned int n = 0; | ||
167 | |||
168 | for_each_zone (zone) { | ||
169 | if (is_highmem(zone)) | ||
170 | continue; | ||
171 | mark_free_pages(zone); | ||
172 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
173 | n += saveable(zone, &zone_pfn); | ||
174 | } | ||
175 | return n; | ||
176 | } | ||
177 | |||
178 | static void copy_data_pages(struct pbe *pblist) | ||
179 | { | ||
180 | struct zone *zone; | ||
181 | unsigned long zone_pfn; | ||
182 | struct pbe *pbe, *p; | ||
183 | |||
184 | pbe = pblist; | ||
185 | for_each_zone (zone) { | ||
186 | if (is_highmem(zone)) | ||
187 | continue; | ||
188 | mark_free_pages(zone); | ||
189 | /* This is necessary for swsusp_free() */ | ||
190 | for_each_pb_page (p, pblist) | ||
191 | SetPageNosaveFree(virt_to_page(p)); | ||
192 | for_each_pbe (p, pblist) | ||
193 | SetPageNosaveFree(virt_to_page(p->address)); | ||
194 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
195 | if (saveable(zone, &zone_pfn)) { | ||
196 | struct page *page; | ||
197 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
198 | BUG_ON(!pbe); | ||
199 | pbe->orig_address = (unsigned long)page_address(page); | ||
200 | /* copy_page is not usable for copying task structs. */ | ||
201 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
202 | pbe = pbe->next; | ||
203 | } | ||
204 | } | ||
205 | } | ||
206 | BUG_ON(pbe); | ||
207 | } | ||
208 | |||
209 | |||
210 | /** | ||
211 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
212 | */ | ||
213 | |||
214 | void free_pagedir(struct pbe *pblist) | ||
215 | { | ||
216 | struct pbe *pbe; | ||
217 | |||
218 | while (pblist) { | ||
219 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
220 | ClearPageNosave(virt_to_page(pblist)); | ||
221 | ClearPageNosaveFree(virt_to_page(pblist)); | ||
222 | free_page((unsigned long)pblist); | ||
223 | pblist = pbe; | ||
224 | } | ||
225 | } | ||
226 | |||
227 | /** | ||
228 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
229 | */ | ||
230 | |||
231 | static inline void fill_pb_page(struct pbe *pbpage) | ||
232 | { | ||
233 | struct pbe *p; | ||
234 | |||
235 | p = pbpage; | ||
236 | pbpage += PB_PAGE_SKIP; | ||
237 | do | ||
238 | p->next = p + 1; | ||
239 | while (++p < pbpage); | ||
240 | } | ||
241 | |||
242 | /** | ||
243 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
244 | * of memory pages allocated with alloc_pagedir() | ||
245 | */ | ||
246 | |||
247 | void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) | ||
248 | { | ||
249 | struct pbe *pbpage, *p; | ||
250 | unsigned int num = PBES_PER_PAGE; | ||
251 | |||
252 | for_each_pb_page (pbpage, pblist) { | ||
253 | if (num >= nr_pages) | ||
254 | break; | ||
255 | |||
256 | fill_pb_page(pbpage); | ||
257 | num += PBES_PER_PAGE; | ||
258 | } | ||
259 | if (pbpage) { | ||
260 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
261 | p->next = p + 1; | ||
262 | p->next = NULL; | ||
263 | } | ||
264 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
265 | } | ||
266 | |||
267 | /** | ||
268 | * @safe_needed - on resume, for storing the PBE list and the image, | ||
269 | * we can only use memory pages that do not conflict with the pages | ||
270 | * which had been used before suspend. | ||
271 | * | ||
272 | * The unsafe pages are marked with the PG_nosave_free flag | ||
273 | * | ||
274 | * Allocated but unusable (ie eaten) memory pages should be marked | ||
275 | * so that swsusp_free() can release them | ||
276 | */ | ||
277 | |||
278 | static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | ||
279 | { | ||
280 | void *res; | ||
281 | |||
282 | if (safe_needed) | ||
283 | do { | ||
284 | res = (void *)get_zeroed_page(gfp_mask); | ||
285 | if (res && PageNosaveFree(virt_to_page(res))) | ||
286 | /* This is for swsusp_free() */ | ||
287 | SetPageNosave(virt_to_page(res)); | ||
288 | } while (res && PageNosaveFree(virt_to_page(res))); | ||
289 | else | ||
290 | res = (void *)get_zeroed_page(gfp_mask); | ||
291 | if (res) { | ||
292 | SetPageNosave(virt_to_page(res)); | ||
293 | SetPageNosaveFree(virt_to_page(res)); | ||
294 | } | ||
295 | return res; | ||
296 | } | ||
297 | |||
298 | unsigned long get_safe_page(gfp_t gfp_mask) | ||
299 | { | ||
300 | return (unsigned long)alloc_image_page(gfp_mask, 1); | ||
301 | } | ||
302 | |||
303 | /** | ||
304 | * alloc_pagedir - Allocate the page directory. | ||
305 | * | ||
306 | * First, determine exactly how many pages we need and | ||
307 | * allocate them. | ||
308 | * | ||
309 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
310 | * struct pbe elements (pbes) and the last element in the page points | ||
311 | * to the next page. | ||
312 | * | ||
313 | * On each page we set up a list of struct_pbe elements. | ||
314 | */ | ||
315 | |||
316 | struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) | ||
317 | { | ||
318 | unsigned int num; | ||
319 | struct pbe *pblist, *pbe; | ||
320 | |||
321 | if (!nr_pages) | ||
322 | return NULL; | ||
323 | |||
324 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
325 | pblist = alloc_image_page(gfp_mask, safe_needed); | ||
326 | /* FIXME: rewrite this ugly loop */ | ||
327 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
328 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
329 | pbe += PB_PAGE_SKIP; | ||
330 | pbe->next = alloc_image_page(gfp_mask, safe_needed); | ||
331 | } | ||
332 | if (!pbe) { /* get_zeroed_page() failed */ | ||
333 | free_pagedir(pblist); | ||
334 | pblist = NULL; | ||
335 | } | ||
336 | return pblist; | ||
337 | } | ||
338 | |||
339 | /** | ||
340 | * Free pages we allocated for suspend. Suspend pages are alocated | ||
341 | * before atomic copy, so we need to free them after resume. | ||
342 | */ | ||
343 | |||
344 | void swsusp_free(void) | ||
345 | { | ||
346 | struct zone *zone; | ||
347 | unsigned long zone_pfn; | ||
348 | |||
349 | for_each_zone(zone) { | ||
350 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
351 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { | ||
352 | struct page *page; | ||
353 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
354 | if (PageNosave(page) && PageNosaveFree(page)) { | ||
355 | ClearPageNosave(page); | ||
356 | ClearPageNosaveFree(page); | ||
357 | free_page((long) page_address(page)); | ||
358 | } | ||
359 | } | ||
360 | } | ||
361 | } | ||
362 | |||
363 | |||
364 | /** | ||
365 | * enough_free_mem - Make sure we enough free memory to snapshot. | ||
366 | * | ||
367 | * Returns TRUE or FALSE after checking the number of available | ||
368 | * free pages. | ||
369 | */ | ||
370 | |||
371 | static int enough_free_mem(unsigned int nr_pages) | ||
372 | { | ||
373 | pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); | ||
374 | return nr_free_pages() > (nr_pages + PAGES_FOR_IO + | ||
375 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
376 | } | ||
377 | |||
378 | int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) | ||
379 | { | ||
380 | struct pbe *p; | ||
381 | |||
382 | for_each_pbe (p, pblist) { | ||
383 | p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed); | ||
384 | if (!p->address) | ||
385 | return -ENOMEM; | ||
386 | } | ||
387 | return 0; | ||
388 | } | ||
389 | |||
390 | static struct pbe *swsusp_alloc(unsigned int nr_pages) | ||
391 | { | ||
392 | struct pbe *pblist; | ||
393 | |||
394 | if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { | ||
395 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
396 | return NULL; | ||
397 | } | ||
398 | create_pbe_list(pblist, nr_pages); | ||
399 | |||
400 | if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { | ||
401 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
402 | swsusp_free(); | ||
403 | return NULL; | ||
404 | } | ||
405 | |||
406 | return pblist; | ||
407 | } | ||
408 | |||
409 | asmlinkage int swsusp_save(void) | ||
410 | { | ||
411 | unsigned int nr_pages; | ||
412 | |||
413 | pr_debug("swsusp: critical section: \n"); | ||
414 | |||
415 | drain_local_pages(); | ||
416 | nr_pages = count_data_pages(); | ||
417 | printk("swsusp: Need to copy %u pages\n", nr_pages); | ||
418 | |||
419 | pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n", | ||
420 | nr_pages, | ||
421 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, | ||
422 | PAGES_FOR_IO, nr_free_pages()); | ||
423 | |||
424 | /* This is needed because of the fixed size of swsusp_info */ | ||
425 | if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) | ||
426 | return -ENOSPC; | ||
427 | |||
428 | if (!enough_free_mem(nr_pages)) { | ||
429 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | ||
430 | return -ENOMEM; | ||
431 | } | ||
432 | |||
433 | pagedir_nosave = swsusp_alloc(nr_pages); | ||
434 | if (!pagedir_nosave) | ||
435 | return -ENOMEM; | ||
436 | |||
437 | /* During allocating of suspend pagedir, new cold pages may appear. | ||
438 | * Kill them. | ||
439 | */ | ||
440 | drain_local_pages(); | ||
441 | copy_data_pages(pagedir_nosave); | ||
442 | |||
443 | /* | ||
444 | * End of critical section. From now on, we can write to memory, | ||
445 | * but we should not touch disk. This specially means we must _not_ | ||
446 | * touch swap space! Except we must write out our image of course. | ||
447 | */ | ||
448 | |||
449 | nr_copy_pages = nr_pages; | ||
450 | |||
451 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); | ||
452 | return 0; | ||
453 | } | ||
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index d967e875ee82..c05f46e7348f 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -1,11 +1,10 @@ | |||
1 | /* | 1 | /* |
2 | * linux/kernel/power/swsusp.c | 2 | * linux/kernel/power/swsusp.c |
3 | * | 3 | * |
4 | * This file is to realize architecture-independent | 4 | * This file provides code to write suspend image to swap and read it back. |
5 | * machine suspend feature using pretty near only high-level routines | ||
6 | * | 5 | * |
7 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> | 6 | * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> |
8 | * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> | 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> |
9 | * | 8 | * |
10 | * This file is released under the GPLv2. | 9 | * This file is released under the GPLv2. |
11 | * | 10 | * |
@@ -47,11 +46,7 @@ | |||
47 | #include <linux/utsname.h> | 46 | #include <linux/utsname.h> |
48 | #include <linux/version.h> | 47 | #include <linux/version.h> |
49 | #include <linux/delay.h> | 48 | #include <linux/delay.h> |
50 | #include <linux/reboot.h> | ||
51 | #include <linux/bitops.h> | 49 | #include <linux/bitops.h> |
52 | #include <linux/vt_kern.h> | ||
53 | #include <linux/kbd_kern.h> | ||
54 | #include <linux/keyboard.h> | ||
55 | #include <linux/spinlock.h> | 50 | #include <linux/spinlock.h> |
56 | #include <linux/genhd.h> | 51 | #include <linux/genhd.h> |
57 | #include <linux/kernel.h> | 52 | #include <linux/kernel.h> |
@@ -63,10 +58,8 @@ | |||
63 | #include <linux/swapops.h> | 58 | #include <linux/swapops.h> |
64 | #include <linux/bootmem.h> | 59 | #include <linux/bootmem.h> |
65 | #include <linux/syscalls.h> | 60 | #include <linux/syscalls.h> |
66 | #include <linux/console.h> | ||
67 | #include <linux/highmem.h> | 61 | #include <linux/highmem.h> |
68 | #include <linux/bio.h> | 62 | #include <linux/bio.h> |
69 | #include <linux/mount.h> | ||
70 | 63 | ||
71 | #include <asm/uaccess.h> | 64 | #include <asm/uaccess.h> |
72 | #include <asm/mmu_context.h> | 65 | #include <asm/mmu_context.h> |
@@ -80,36 +73,31 @@ | |||
80 | 73 | ||
81 | #include "power.h" | 74 | #include "power.h" |
82 | 75 | ||
76 | #ifdef CONFIG_HIGHMEM | ||
77 | int save_highmem(void); | ||
78 | int restore_highmem(void); | ||
79 | #else | ||
80 | static int save_highmem(void) { return 0; } | ||
81 | static int restore_highmem(void) { return 0; } | ||
82 | #endif | ||
83 | |||
83 | #define CIPHER "aes" | 84 | #define CIPHER "aes" |
84 | #define MAXKEY 32 | 85 | #define MAXKEY 32 |
85 | #define MAXIV 32 | 86 | #define MAXIV 32 |
86 | 87 | ||
87 | /* References to section boundaries */ | ||
88 | extern const void __nosave_begin, __nosave_end; | ||
89 | |||
90 | /* Variables to be preserved over suspend */ | ||
91 | static int nr_copy_pages_check; | ||
92 | |||
93 | extern char resume_file[]; | 88 | extern char resume_file[]; |
94 | 89 | ||
95 | /* Local variables that should not be affected by save */ | 90 | /* Local variables that should not be affected by save */ |
96 | static unsigned int nr_copy_pages __nosavedata = 0; | 91 | unsigned int nr_copy_pages __nosavedata = 0; |
97 | 92 | ||
98 | /* Suspend pagedir is allocated before final copy, therefore it | 93 | /* Suspend pagedir is allocated before final copy, therefore it |
99 | must be freed after resume | 94 | must be freed after resume |
100 | 95 | ||
101 | Warning: this is evil. There are actually two pagedirs at time of | ||
102 | resume. One is "pagedir_save", which is empty frame allocated at | ||
103 | time of suspend, that must be freed. Second is "pagedir_nosave", | ||
104 | allocated at time of resume, that travels through memory not to | ||
105 | collide with anything. | ||
106 | |||
107 | Warning: this is even more evil than it seems. Pagedirs this file | 96 | Warning: this is even more evil than it seems. Pagedirs this file |
108 | talks about are completely different from page directories used by | 97 | talks about are completely different from page directories used by |
109 | MMU hardware. | 98 | MMU hardware. |
110 | */ | 99 | */ |
111 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; | 100 | suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; |
112 | static suspend_pagedir_t *pagedir_save; | ||
113 | 101 | ||
114 | #define SWSUSP_SIG "S1SUSPEND" | 102 | #define SWSUSP_SIG "S1SUSPEND" |
115 | 103 | ||
@@ -124,12 +112,6 @@ static struct swsusp_header { | |||
124 | static struct swsusp_info swsusp_info; | 112 | static struct swsusp_info swsusp_info; |
125 | 113 | ||
126 | /* | 114 | /* |
127 | * XXX: We try to keep some more pages free so that I/O operations succeed | ||
128 | * without paging. Might this be more? | ||
129 | */ | ||
130 | #define PAGES_FOR_IO 512 | ||
131 | |||
132 | /* | ||
133 | * Saving part... | 115 | * Saving part... |
134 | */ | 116 | */ |
135 | 117 | ||
@@ -141,8 +123,8 @@ static struct swsusp_info swsusp_info; | |||
141 | static unsigned short swapfile_used[MAX_SWAPFILES]; | 123 | static unsigned short swapfile_used[MAX_SWAPFILES]; |
142 | static unsigned short root_swap; | 124 | static unsigned short root_swap; |
143 | 125 | ||
144 | static int write_page(unsigned long addr, swp_entry_t * loc); | 126 | static int write_page(unsigned long addr, swp_entry_t *loc); |
145 | static int bio_read_page(pgoff_t page_off, void * page); | 127 | static int bio_read_page(pgoff_t page_off, void *page); |
146 | 128 | ||
147 | static u8 key_iv[MAXKEY+MAXIV]; | 129 | static u8 key_iv[MAXKEY+MAXIV]; |
148 | 130 | ||
@@ -363,7 +345,7 @@ static void lock_swapdevices(void) | |||
363 | } | 345 | } |
364 | 346 | ||
365 | /** | 347 | /** |
366 | * write_swap_page - Write one page to a fresh swap location. | 348 | * write_page - Write one page to a fresh swap location. |
367 | * @addr: Address we're writing. | 349 | * @addr: Address we're writing. |
368 | * @loc: Place to store the entry we used. | 350 | * @loc: Place to store the entry we used. |
369 | * | 351 | * |
@@ -374,7 +356,7 @@ static void lock_swapdevices(void) | |||
374 | * This is a partial improvement, since we will at least return other | 356 | * This is a partial improvement, since we will at least return other |
375 | * errors, though we need to eventually fix the damn code. | 357 | * errors, though we need to eventually fix the damn code. |
376 | */ | 358 | */ |
377 | static int write_page(unsigned long addr, swp_entry_t * loc) | 359 | static int write_page(unsigned long addr, swp_entry_t *loc) |
378 | { | 360 | { |
379 | swp_entry_t entry; | 361 | swp_entry_t entry; |
380 | int error = 0; | 362 | int error = 0; |
@@ -402,15 +384,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc) | |||
402 | static void data_free(void) | 384 | static void data_free(void) |
403 | { | 385 | { |
404 | swp_entry_t entry; | 386 | swp_entry_t entry; |
405 | int i; | 387 | struct pbe *p; |
406 | 388 | ||
407 | for (i = 0; i < nr_copy_pages; i++) { | 389 | for_each_pbe (p, pagedir_nosave) { |
408 | entry = (pagedir_nosave + i)->swap_address; | 390 | entry = p->swap_address; |
409 | if (entry.val) | 391 | if (entry.val) |
410 | swap_free(entry); | 392 | swap_free(entry); |
411 | else | 393 | else |
412 | break; | 394 | break; |
413 | (pagedir_nosave + i)->swap_address = (swp_entry_t){0}; | ||
414 | } | 395 | } |
415 | } | 396 | } |
416 | 397 | ||
@@ -512,8 +493,8 @@ static void free_pagedir_entries(void) | |||
512 | static int write_pagedir(void) | 493 | static int write_pagedir(void) |
513 | { | 494 | { |
514 | int error = 0; | 495 | int error = 0; |
515 | unsigned n = 0; | 496 | unsigned int n = 0; |
516 | struct pbe * pbe; | 497 | struct pbe *pbe; |
517 | 498 | ||
518 | printk( "Writing pagedir..."); | 499 | printk( "Writing pagedir..."); |
519 | for_each_pb_page (pbe, pagedir_nosave) { | 500 | for_each_pb_page (pbe, pagedir_nosave) { |
@@ -527,6 +508,26 @@ static int write_pagedir(void) | |||
527 | } | 508 | } |
528 | 509 | ||
529 | /** | 510 | /** |
511 | * enough_swap - Make sure we have enough swap to save the image. | ||
512 | * | ||
513 | * Returns TRUE or FALSE after checking the total amount of swap | ||
514 | * space available. | ||
515 | * | ||
516 | * FIXME: si_swapinfo(&i) returns all swap devices information. | ||
517 | * We should only consider resume_device. | ||
518 | */ | ||
519 | |||
520 | static int enough_swap(unsigned int nr_pages) | ||
521 | { | ||
522 | struct sysinfo i; | ||
523 | |||
524 | si_swapinfo(&i); | ||
525 | pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); | ||
526 | return i.freeswap > (nr_pages + PAGES_FOR_IO + | ||
527 | (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); | ||
528 | } | ||
529 | |||
530 | /** | ||
530 | * write_suspend_image - Write entire image and metadata. | 531 | * write_suspend_image - Write entire image and metadata. |
531 | * | 532 | * |
532 | */ | 533 | */ |
@@ -534,6 +535,11 @@ static int write_suspend_image(void) | |||
534 | { | 535 | { |
535 | int error; | 536 | int error; |
536 | 537 | ||
538 | if (!enough_swap(nr_copy_pages)) { | ||
539 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | ||
540 | return -ENOSPC; | ||
541 | } | ||
542 | |||
537 | init_header(); | 543 | init_header(); |
538 | if ((error = data_write())) | 544 | if ((error = data_write())) |
539 | goto FreeData; | 545 | goto FreeData; |
@@ -553,433 +559,6 @@ static int write_suspend_image(void) | |||
553 | goto Done; | 559 | goto Done; |
554 | } | 560 | } |
555 | 561 | ||
556 | |||
557 | #ifdef CONFIG_HIGHMEM | ||
558 | struct highmem_page { | ||
559 | char *data; | ||
560 | struct page *page; | ||
561 | struct highmem_page *next; | ||
562 | }; | ||
563 | |||
564 | static struct highmem_page *highmem_copy; | ||
565 | |||
566 | static int save_highmem_zone(struct zone *zone) | ||
567 | { | ||
568 | unsigned long zone_pfn; | ||
569 | mark_free_pages(zone); | ||
570 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
571 | struct page *page; | ||
572 | struct highmem_page *save; | ||
573 | void *kaddr; | ||
574 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
575 | |||
576 | if (!(pfn%1000)) | ||
577 | printk("."); | ||
578 | if (!pfn_valid(pfn)) | ||
579 | continue; | ||
580 | page = pfn_to_page(pfn); | ||
581 | /* | ||
582 | * This condition results from rvmalloc() sans vmalloc_32() | ||
583 | * and architectural memory reservations. This should be | ||
584 | * corrected eventually when the cases giving rise to this | ||
585 | * are better understood. | ||
586 | */ | ||
587 | if (PageReserved(page)) { | ||
588 | printk("highmem reserved page?!\n"); | ||
589 | continue; | ||
590 | } | ||
591 | BUG_ON(PageNosave(page)); | ||
592 | if (PageNosaveFree(page)) | ||
593 | continue; | ||
594 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
595 | if (!save) | ||
596 | return -ENOMEM; | ||
597 | save->next = highmem_copy; | ||
598 | save->page = page; | ||
599 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
600 | if (!save->data) { | ||
601 | kfree(save); | ||
602 | return -ENOMEM; | ||
603 | } | ||
604 | kaddr = kmap_atomic(page, KM_USER0); | ||
605 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
606 | kunmap_atomic(kaddr, KM_USER0); | ||
607 | highmem_copy = save; | ||
608 | } | ||
609 | return 0; | ||
610 | } | ||
611 | #endif /* CONFIG_HIGHMEM */ | ||
612 | |||
613 | |||
614 | static int save_highmem(void) | ||
615 | { | ||
616 | #ifdef CONFIG_HIGHMEM | ||
617 | struct zone *zone; | ||
618 | int res = 0; | ||
619 | |||
620 | pr_debug("swsusp: Saving Highmem\n"); | ||
621 | for_each_zone (zone) { | ||
622 | if (is_highmem(zone)) | ||
623 | res = save_highmem_zone(zone); | ||
624 | if (res) | ||
625 | return res; | ||
626 | } | ||
627 | #endif | ||
628 | return 0; | ||
629 | } | ||
630 | |||
631 | static int restore_highmem(void) | ||
632 | { | ||
633 | #ifdef CONFIG_HIGHMEM | ||
634 | printk("swsusp: Restoring Highmem\n"); | ||
635 | while (highmem_copy) { | ||
636 | struct highmem_page *save = highmem_copy; | ||
637 | void *kaddr; | ||
638 | highmem_copy = save->next; | ||
639 | |||
640 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
641 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
642 | kunmap_atomic(kaddr, KM_USER0); | ||
643 | free_page((long) save->data); | ||
644 | kfree(save); | ||
645 | } | ||
646 | #endif | ||
647 | return 0; | ||
648 | } | ||
649 | |||
650 | |||
651 | static int pfn_is_nosave(unsigned long pfn) | ||
652 | { | ||
653 | unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; | ||
654 | unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; | ||
655 | return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); | ||
656 | } | ||
657 | |||
658 | /** | ||
659 | * saveable - Determine whether a page should be cloned or not. | ||
660 | * @pfn: The page | ||
661 | * | ||
662 | * We save a page if it's Reserved, and not in the range of pages | ||
663 | * statically defined as 'unsaveable', or if it isn't reserved, and | ||
664 | * isn't part of a free chunk of pages. | ||
665 | */ | ||
666 | |||
667 | static int saveable(struct zone * zone, unsigned long * zone_pfn) | ||
668 | { | ||
669 | unsigned long pfn = *zone_pfn + zone->zone_start_pfn; | ||
670 | struct page * page; | ||
671 | |||
672 | if (!pfn_valid(pfn)) | ||
673 | return 0; | ||
674 | |||
675 | page = pfn_to_page(pfn); | ||
676 | BUG_ON(PageReserved(page) && PageNosave(page)); | ||
677 | if (PageNosave(page)) | ||
678 | return 0; | ||
679 | if (PageReserved(page) && pfn_is_nosave(pfn)) { | ||
680 | pr_debug("[nosave pfn 0x%lx]", pfn); | ||
681 | return 0; | ||
682 | } | ||
683 | if (PageNosaveFree(page)) | ||
684 | return 0; | ||
685 | |||
686 | return 1; | ||
687 | } | ||
688 | |||
689 | static void count_data_pages(void) | ||
690 | { | ||
691 | struct zone *zone; | ||
692 | unsigned long zone_pfn; | ||
693 | |||
694 | nr_copy_pages = 0; | ||
695 | |||
696 | for_each_zone (zone) { | ||
697 | if (is_highmem(zone)) | ||
698 | continue; | ||
699 | mark_free_pages(zone); | ||
700 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | ||
701 | nr_copy_pages += saveable(zone, &zone_pfn); | ||
702 | } | ||
703 | } | ||
704 | |||
705 | |||
706 | static void copy_data_pages(void) | ||
707 | { | ||
708 | struct zone *zone; | ||
709 | unsigned long zone_pfn; | ||
710 | struct pbe * pbe = pagedir_nosave; | ||
711 | |||
712 | pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages); | ||
713 | for_each_zone (zone) { | ||
714 | if (is_highmem(zone)) | ||
715 | continue; | ||
716 | mark_free_pages(zone); | ||
717 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
718 | if (saveable(zone, &zone_pfn)) { | ||
719 | struct page * page; | ||
720 | page = pfn_to_page(zone_pfn + zone->zone_start_pfn); | ||
721 | BUG_ON(!pbe); | ||
722 | pbe->orig_address = (long) page_address(page); | ||
723 | /* copy_page is not usable for copying task structs. */ | ||
724 | memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); | ||
725 | pbe = pbe->next; | ||
726 | } | ||
727 | } | ||
728 | } | ||
729 | BUG_ON(pbe); | ||
730 | } | ||
731 | |||
732 | |||
733 | /** | ||
734 | * calc_nr - Determine the number of pages needed for a pbe list. | ||
735 | */ | ||
736 | |||
737 | static int calc_nr(int nr_copy) | ||
738 | { | ||
739 | return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); | ||
740 | } | ||
741 | |||
742 | /** | ||
743 | * free_pagedir - free pages allocated with alloc_pagedir() | ||
744 | */ | ||
745 | |||
746 | static inline void free_pagedir(struct pbe *pblist) | ||
747 | { | ||
748 | struct pbe *pbe; | ||
749 | |||
750 | while (pblist) { | ||
751 | pbe = (pblist + PB_PAGE_SKIP)->next; | ||
752 | free_page((unsigned long)pblist); | ||
753 | pblist = pbe; | ||
754 | } | ||
755 | } | ||
756 | |||
757 | /** | ||
758 | * fill_pb_page - Create a list of PBEs on a given memory page | ||
759 | */ | ||
760 | |||
761 | static inline void fill_pb_page(struct pbe *pbpage) | ||
762 | { | ||
763 | struct pbe *p; | ||
764 | |||
765 | p = pbpage; | ||
766 | pbpage += PB_PAGE_SKIP; | ||
767 | do | ||
768 | p->next = p + 1; | ||
769 | while (++p < pbpage); | ||
770 | } | ||
771 | |||
772 | /** | ||
773 | * create_pbe_list - Create a list of PBEs on top of a given chain | ||
774 | * of memory pages allocated with alloc_pagedir() | ||
775 | */ | ||
776 | |||
777 | static void create_pbe_list(struct pbe *pblist, unsigned nr_pages) | ||
778 | { | ||
779 | struct pbe *pbpage, *p; | ||
780 | unsigned num = PBES_PER_PAGE; | ||
781 | |||
782 | for_each_pb_page (pbpage, pblist) { | ||
783 | if (num >= nr_pages) | ||
784 | break; | ||
785 | |||
786 | fill_pb_page(pbpage); | ||
787 | num += PBES_PER_PAGE; | ||
788 | } | ||
789 | if (pbpage) { | ||
790 | for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++) | ||
791 | p->next = p + 1; | ||
792 | p->next = NULL; | ||
793 | } | ||
794 | pr_debug("create_pbe_list(): initialized %d PBEs\n", num); | ||
795 | } | ||
796 | |||
797 | /** | ||
798 | * alloc_pagedir - Allocate the page directory. | ||
799 | * | ||
800 | * First, determine exactly how many pages we need and | ||
801 | * allocate them. | ||
802 | * | ||
803 | * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE | ||
804 | * struct pbe elements (pbes) and the last element in the page points | ||
805 | * to the next page. | ||
806 | * | ||
807 | * On each page we set up a list of struct_pbe elements. | ||
808 | */ | ||
809 | |||
810 | static struct pbe * alloc_pagedir(unsigned nr_pages) | ||
811 | { | ||
812 | unsigned num; | ||
813 | struct pbe *pblist, *pbe; | ||
814 | |||
815 | if (!nr_pages) | ||
816 | return NULL; | ||
817 | |||
818 | pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages); | ||
819 | pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
820 | for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; | ||
821 | pbe = pbe->next, num += PBES_PER_PAGE) { | ||
822 | pbe += PB_PAGE_SKIP; | ||
823 | pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
824 | } | ||
825 | if (!pbe) { /* get_zeroed_page() failed */ | ||
826 | free_pagedir(pblist); | ||
827 | pblist = NULL; | ||
828 | } | ||
829 | return pblist; | ||
830 | } | ||
831 | |||
832 | /** | ||
833 | * free_image_pages - Free pages allocated for snapshot | ||
834 | */ | ||
835 | |||
836 | static void free_image_pages(void) | ||
837 | { | ||
838 | struct pbe * p; | ||
839 | |||
840 | for_each_pbe (p, pagedir_save) { | ||
841 | if (p->address) { | ||
842 | ClearPageNosave(virt_to_page(p->address)); | ||
843 | free_page(p->address); | ||
844 | p->address = 0; | ||
845 | } | ||
846 | } | ||
847 | } | ||
848 | |||
849 | /** | ||
850 | * alloc_image_pages - Allocate pages for the snapshot. | ||
851 | */ | ||
852 | |||
853 | static int alloc_image_pages(void) | ||
854 | { | ||
855 | struct pbe * p; | ||
856 | |||
857 | for_each_pbe (p, pagedir_save) { | ||
858 | p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD); | ||
859 | if (!p->address) | ||
860 | return -ENOMEM; | ||
861 | SetPageNosave(virt_to_page(p->address)); | ||
862 | } | ||
863 | return 0; | ||
864 | } | ||
865 | |||
866 | void swsusp_free(void) | ||
867 | { | ||
868 | BUG_ON(PageNosave(virt_to_page(pagedir_save))); | ||
869 | BUG_ON(PageNosaveFree(virt_to_page(pagedir_save))); | ||
870 | free_image_pages(); | ||
871 | free_pagedir(pagedir_save); | ||
872 | } | ||
873 | |||
874 | |||
875 | /** | ||
876 | * enough_free_mem - Make sure we enough free memory to snapshot. | ||
877 | * | ||
878 | * Returns TRUE or FALSE after checking the number of available | ||
879 | * free pages. | ||
880 | */ | ||
881 | |||
882 | static int enough_free_mem(void) | ||
883 | { | ||
884 | if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) { | ||
885 | pr_debug("swsusp: Not enough free pages: Have %d\n", | ||
886 | nr_free_pages()); | ||
887 | return 0; | ||
888 | } | ||
889 | return 1; | ||
890 | } | ||
891 | |||
892 | |||
893 | /** | ||
894 | * enough_swap - Make sure we have enough swap to save the image. | ||
895 | * | ||
896 | * Returns TRUE or FALSE after checking the total amount of swap | ||
897 | * space avaiable. | ||
898 | * | ||
899 | * FIXME: si_swapinfo(&i) returns all swap devices information. | ||
900 | * We should only consider resume_device. | ||
901 | */ | ||
902 | |||
903 | static int enough_swap(void) | ||
904 | { | ||
905 | struct sysinfo i; | ||
906 | |||
907 | si_swapinfo(&i); | ||
908 | if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) { | ||
909 | pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap); | ||
910 | return 0; | ||
911 | } | ||
912 | return 1; | ||
913 | } | ||
914 | |||
915 | static int swsusp_alloc(void) | ||
916 | { | ||
917 | int error; | ||
918 | |||
919 | pagedir_nosave = NULL; | ||
920 | nr_copy_pages = calc_nr(nr_copy_pages); | ||
921 | |||
922 | pr_debug("suspend: (pages needed: %d + %d free: %d)\n", | ||
923 | nr_copy_pages, PAGES_FOR_IO, nr_free_pages()); | ||
924 | |||
925 | if (!enough_free_mem()) | ||
926 | return -ENOMEM; | ||
927 | |||
928 | if (!enough_swap()) | ||
929 | return -ENOSPC; | ||
930 | |||
931 | if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) { | ||
932 | printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); | ||
933 | return -ENOMEM; | ||
934 | } | ||
935 | create_pbe_list(pagedir_save, nr_copy_pages); | ||
936 | pagedir_nosave = pagedir_save; | ||
937 | if ((error = alloc_image_pages())) { | ||
938 | printk(KERN_ERR "suspend: Allocating image pages failed.\n"); | ||
939 | swsusp_free(); | ||
940 | return error; | ||
941 | } | ||
942 | |||
943 | nr_copy_pages_check = nr_copy_pages; | ||
944 | return 0; | ||
945 | } | ||
946 | |||
947 | static int suspend_prepare_image(void) | ||
948 | { | ||
949 | int error; | ||
950 | |||
951 | pr_debug("swsusp: critical section: \n"); | ||
952 | if (save_highmem()) { | ||
953 | printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n"); | ||
954 | restore_highmem(); | ||
955 | return -ENOMEM; | ||
956 | } | ||
957 | |||
958 | drain_local_pages(); | ||
959 | count_data_pages(); | ||
960 | printk("swsusp: Need to copy %u pages\n", nr_copy_pages); | ||
961 | |||
962 | error = swsusp_alloc(); | ||
963 | if (error) | ||
964 | return error; | ||
965 | |||
966 | /* During allocating of suspend pagedir, new cold pages may appear. | ||
967 | * Kill them. | ||
968 | */ | ||
969 | drain_local_pages(); | ||
970 | copy_data_pages(); | ||
971 | |||
972 | /* | ||
973 | * End of critical section. From now on, we can write to memory, | ||
974 | * but we should not touch disk. This specially means we must _not_ | ||
975 | * touch swap space! Except we must write out our image of course. | ||
976 | */ | ||
977 | |||
978 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages ); | ||
979 | return 0; | ||
980 | } | ||
981 | |||
982 | |||
983 | /* It is important _NOT_ to umount filesystems at this point. We want | 562 | /* It is important _NOT_ to umount filesystems at this point. We want |
984 | * them synced (in case something goes wrong) but we DO not want to mark | 563 | * them synced (in case something goes wrong) but we DO not want to mark |
985 | * filesystem clean: it is not. (And it does not matter, if we resume | 564 | * filesystem clean: it is not. (And it does not matter, if we resume |
@@ -988,28 +567,24 @@ static int suspend_prepare_image(void) | |||
988 | int swsusp_write(void) | 567 | int swsusp_write(void) |
989 | { | 568 | { |
990 | int error; | 569 | int error; |
991 | device_resume(); | 570 | |
571 | if ((error = swsusp_swap_check())) { | ||
572 | printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); | ||
573 | return error; | ||
574 | } | ||
992 | lock_swapdevices(); | 575 | lock_swapdevices(); |
993 | error = write_suspend_image(); | 576 | error = write_suspend_image(); |
994 | /* This will unlock ignored swap devices since writing is finished */ | 577 | /* This will unlock ignored swap devices since writing is finished */ |
995 | lock_swapdevices(); | 578 | lock_swapdevices(); |
996 | return error; | 579 | return error; |
997 | |||
998 | } | 580 | } |
999 | 581 | ||
1000 | 582 | ||
1001 | extern asmlinkage int swsusp_arch_suspend(void); | ||
1002 | extern asmlinkage int swsusp_arch_resume(void); | ||
1003 | |||
1004 | |||
1005 | asmlinkage int swsusp_save(void) | ||
1006 | { | ||
1007 | return suspend_prepare_image(); | ||
1008 | } | ||
1009 | 583 | ||
1010 | int swsusp_suspend(void) | 584 | int swsusp_suspend(void) |
1011 | { | 585 | { |
1012 | int error; | 586 | int error; |
587 | |||
1013 | if ((error = arch_prepare_suspend())) | 588 | if ((error = arch_prepare_suspend())) |
1014 | return error; | 589 | return error; |
1015 | local_irq_disable(); | 590 | local_irq_disable(); |
@@ -1021,15 +596,12 @@ int swsusp_suspend(void) | |||
1021 | */ | 596 | */ |
1022 | if ((error = device_power_down(PMSG_FREEZE))) { | 597 | if ((error = device_power_down(PMSG_FREEZE))) { |
1023 | printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); | 598 | printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); |
1024 | local_irq_enable(); | 599 | goto Enable_irqs; |
1025 | return error; | ||
1026 | } | 600 | } |
1027 | 601 | ||
1028 | if ((error = swsusp_swap_check())) { | 602 | if ((error = save_highmem())) { |
1029 | printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); | 603 | printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); |
1030 | device_power_up(); | 604 | goto Restore_highmem; |
1031 | local_irq_enable(); | ||
1032 | return error; | ||
1033 | } | 605 | } |
1034 | 606 | ||
1035 | save_processor_state(); | 607 | save_processor_state(); |
@@ -1037,9 +609,10 @@ int swsusp_suspend(void) | |||
1037 | printk(KERN_ERR "Error %d suspending\n", error); | 609 | printk(KERN_ERR "Error %d suspending\n", error); |
1038 | /* Restore control flow magically appears here */ | 610 | /* Restore control flow magically appears here */ |
1039 | restore_processor_state(); | 611 | restore_processor_state(); |
1040 | BUG_ON (nr_copy_pages_check != nr_copy_pages); | 612 | Restore_highmem: |
1041 | restore_highmem(); | 613 | restore_highmem(); |
1042 | device_power_up(); | 614 | device_power_up(); |
615 | Enable_irqs: | ||
1043 | local_irq_enable(); | 616 | local_irq_enable(); |
1044 | return error; | 617 | return error; |
1045 | } | 618 | } |
@@ -1057,6 +630,11 @@ int swsusp_resume(void) | |||
1057 | * execution continues at place where swsusp_arch_suspend was called | 630 | * execution continues at place where swsusp_arch_suspend was called |
1058 | */ | 631 | */ |
1059 | BUG_ON(!error); | 632 | BUG_ON(!error); |
633 | /* The only reason why swsusp_arch_resume() can fail is memory being | ||
634 | * very tight, so we have to free it as soon as we can to avoid | ||
635 | * subsequent failures | ||
636 | */ | ||
637 | swsusp_free(); | ||
1060 | restore_processor_state(); | 638 | restore_processor_state(); |
1061 | restore_highmem(); | 639 | restore_highmem(); |
1062 | touch_softlockup_watchdog(); | 640 | touch_softlockup_watchdog(); |
@@ -1066,158 +644,43 @@ int swsusp_resume(void) | |||
1066 | } | 644 | } |
1067 | 645 | ||
1068 | /** | 646 | /** |
1069 | * On resume, for storing the PBE list and the image, | 647 | * mark_unsafe_pages - mark the pages that cannot be used for storing |
1070 | * we can only use memory pages that do not conflict with the pages | 648 | * the image during resume, because they conflict with the pages that |
1071 | * which had been used before suspend. | 649 | * had been used before suspend |
1072 | * | ||
1073 | * We don't know which pages are usable until we allocate them. | ||
1074 | * | ||
1075 | * Allocated but unusable (ie eaten) memory pages are linked together | ||
1076 | * to create a list, so that we can free them easily | ||
1077 | * | ||
1078 | * We could have used a type other than (void *) | ||
1079 | * for this purpose, but ... | ||
1080 | */ | 650 | */ |
1081 | static void **eaten_memory = NULL; | ||
1082 | |||
1083 | static inline void eat_page(void *page) | ||
1084 | { | ||
1085 | void **c; | ||
1086 | 651 | ||
1087 | c = eaten_memory; | 652 | static void mark_unsafe_pages(struct pbe *pblist) |
1088 | eaten_memory = page; | ||
1089 | *eaten_memory = c; | ||
1090 | } | ||
1091 | |||
1092 | static unsigned long get_usable_page(unsigned gfp_mask) | ||
1093 | { | ||
1094 | unsigned long m; | ||
1095 | |||
1096 | m = get_zeroed_page(gfp_mask); | ||
1097 | while (!PageNosaveFree(virt_to_page(m))) { | ||
1098 | eat_page((void *)m); | ||
1099 | m = get_zeroed_page(gfp_mask); | ||
1100 | if (!m) | ||
1101 | break; | ||
1102 | } | ||
1103 | return m; | ||
1104 | } | ||
1105 | |||
1106 | static void free_eaten_memory(void) | ||
1107 | { | ||
1108 | unsigned long m; | ||
1109 | void **c; | ||
1110 | int i = 0; | ||
1111 | |||
1112 | c = eaten_memory; | ||
1113 | while (c) { | ||
1114 | m = (unsigned long)c; | ||
1115 | c = *c; | ||
1116 | free_page(m); | ||
1117 | i++; | ||
1118 | } | ||
1119 | eaten_memory = NULL; | ||
1120 | pr_debug("swsusp: %d unused pages freed\n", i); | ||
1121 | } | ||
1122 | |||
1123 | /** | ||
1124 | * check_pagedir - We ensure here that pages that the PBEs point to | ||
1125 | * won't collide with pages where we're going to restore from the loaded | ||
1126 | * pages later | ||
1127 | */ | ||
1128 | |||
1129 | static int check_pagedir(struct pbe *pblist) | ||
1130 | { | ||
1131 | struct pbe *p; | ||
1132 | |||
1133 | /* This is necessary, so that we can free allocated pages | ||
1134 | * in case of failure | ||
1135 | */ | ||
1136 | for_each_pbe (p, pblist) | ||
1137 | p->address = 0UL; | ||
1138 | |||
1139 | for_each_pbe (p, pblist) { | ||
1140 | p->address = get_usable_page(GFP_ATOMIC); | ||
1141 | if (!p->address) | ||
1142 | return -ENOMEM; | ||
1143 | } | ||
1144 | return 0; | ||
1145 | } | ||
1146 | |||
1147 | /** | ||
1148 | * swsusp_pagedir_relocate - It is possible, that some memory pages | ||
1149 | * occupied by the list of PBEs collide with pages where we're going to | ||
1150 | * restore from the loaded pages later. We relocate them here. | ||
1151 | */ | ||
1152 | |||
1153 | static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | ||
1154 | { | 653 | { |
1155 | struct zone *zone; | 654 | struct zone *zone; |
1156 | unsigned long zone_pfn; | 655 | unsigned long zone_pfn; |
1157 | struct pbe *pbpage, *tail, *p; | 656 | struct pbe *p; |
1158 | void *m; | ||
1159 | int rel = 0, error = 0; | ||
1160 | 657 | ||
1161 | if (!pblist) /* a sanity check */ | 658 | if (!pblist) /* a sanity check */ |
1162 | return NULL; | 659 | return; |
1163 | |||
1164 | pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n", | ||
1165 | swsusp_info.pagedir_pages); | ||
1166 | |||
1167 | /* Set page flags */ | ||
1168 | 660 | ||
661 | /* Clear page flags */ | ||
1169 | for_each_zone (zone) { | 662 | for_each_zone (zone) { |
1170 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) | 663 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) |
1171 | SetPageNosaveFree(pfn_to_page(zone_pfn + | 664 | if (pfn_valid(zone_pfn + zone->zone_start_pfn)) |
665 | ClearPageNosaveFree(pfn_to_page(zone_pfn + | ||
1172 | zone->zone_start_pfn)); | 666 | zone->zone_start_pfn)); |
1173 | } | 667 | } |
1174 | 668 | ||
1175 | /* Clear orig addresses */ | 669 | /* Mark orig addresses */ |
1176 | |||
1177 | for_each_pbe (p, pblist) | 670 | for_each_pbe (p, pblist) |
1178 | ClearPageNosaveFree(virt_to_page(p->orig_address)); | 671 | SetPageNosaveFree(virt_to_page(p->orig_address)); |
1179 | |||
1180 | tail = pblist + PB_PAGE_SKIP; | ||
1181 | |||
1182 | /* Relocate colliding pages */ | ||
1183 | |||
1184 | for_each_pb_page (pbpage, pblist) { | ||
1185 | if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) { | ||
1186 | m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD); | ||
1187 | if (!m) { | ||
1188 | error = -ENOMEM; | ||
1189 | break; | ||
1190 | } | ||
1191 | memcpy(m, (void *)pbpage, PAGE_SIZE); | ||
1192 | if (pbpage == pblist) | ||
1193 | pblist = (struct pbe *)m; | ||
1194 | else | ||
1195 | tail->next = (struct pbe *)m; | ||
1196 | |||
1197 | eat_page((void *)pbpage); | ||
1198 | pbpage = (struct pbe *)m; | ||
1199 | |||
1200 | /* We have to link the PBEs again */ | ||
1201 | 672 | ||
1202 | for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) | 673 | } |
1203 | if (p->next) /* needed to save the end */ | ||
1204 | p->next = p + 1; | ||
1205 | |||
1206 | rel++; | ||
1207 | } | ||
1208 | tail = pbpage + PB_PAGE_SKIP; | ||
1209 | } | ||
1210 | 674 | ||
1211 | if (error) { | 675 | static void copy_page_backup_list(struct pbe *dst, struct pbe *src) |
1212 | printk("\nswsusp: Out of memory\n\n"); | 676 | { |
1213 | free_pagedir(pblist); | 677 | /* We assume both lists contain the same number of elements */ |
1214 | free_eaten_memory(); | 678 | while (src) { |
1215 | pblist = NULL; | 679 | dst->orig_address = src->orig_address; |
680 | dst->swap_address = src->swap_address; | ||
681 | dst = dst->next; | ||
682 | src = src->next; | ||
1216 | } | 683 | } |
1217 | else | ||
1218 | printk("swsusp: Relocated %d pages\n", rel); | ||
1219 | |||
1220 | return pblist; | ||
1221 | } | 684 | } |
1222 | 685 | ||
1223 | /* | 686 | /* |
@@ -1231,7 +694,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist) | |||
1231 | 694 | ||
1232 | static atomic_t io_done = ATOMIC_INIT(0); | 695 | static atomic_t io_done = ATOMIC_INIT(0); |
1233 | 696 | ||
1234 | static int end_io(struct bio * bio, unsigned int num, int err) | 697 | static int end_io(struct bio *bio, unsigned int num, int err) |
1235 | { | 698 | { |
1236 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) | 699 | if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1237 | panic("I/O error reading memory image"); | 700 | panic("I/O error reading memory image"); |
@@ -1239,7 +702,7 @@ static int end_io(struct bio * bio, unsigned int num, int err) | |||
1239 | return 0; | 702 | return 0; |
1240 | } | 703 | } |
1241 | 704 | ||
1242 | static struct block_device * resume_bdev; | 705 | static struct block_device *resume_bdev; |
1243 | 706 | ||
1244 | /** | 707 | /** |
1245 | * submit - submit BIO request. | 708 | * submit - submit BIO request. |
@@ -1252,10 +715,10 @@ static struct block_device * resume_bdev; | |||
1252 | * Then submit it and wait. | 715 | * Then submit it and wait. |
1253 | */ | 716 | */ |
1254 | 717 | ||
1255 | static int submit(int rw, pgoff_t page_off, void * page) | 718 | static int submit(int rw, pgoff_t page_off, void *page) |
1256 | { | 719 | { |
1257 | int error = 0; | 720 | int error = 0; |
1258 | struct bio * bio; | 721 | struct bio *bio; |
1259 | 722 | ||
1260 | bio = bio_alloc(GFP_ATOMIC, 1); | 723 | bio = bio_alloc(GFP_ATOMIC, 1); |
1261 | if (!bio) | 724 | if (!bio) |
@@ -1284,12 +747,12 @@ static int submit(int rw, pgoff_t page_off, void * page) | |||
1284 | return error; | 747 | return error; |
1285 | } | 748 | } |
1286 | 749 | ||
1287 | static int bio_read_page(pgoff_t page_off, void * page) | 750 | static int bio_read_page(pgoff_t page_off, void *page) |
1288 | { | 751 | { |
1289 | return submit(READ, page_off, page); | 752 | return submit(READ, page_off, page); |
1290 | } | 753 | } |
1291 | 754 | ||
1292 | static int bio_write_page(pgoff_t page_off, void * page) | 755 | static int bio_write_page(pgoff_t page_off, void *page) |
1293 | { | 756 | { |
1294 | return submit(WRITE, page_off, page); | 757 | return submit(WRITE, page_off, page); |
1295 | } | 758 | } |
@@ -1299,7 +762,7 @@ static int bio_write_page(pgoff_t page_off, void * page) | |||
1299 | * I really don't think that it's foolproof but more than nothing.. | 762 | * I really don't think that it's foolproof but more than nothing.. |
1300 | */ | 763 | */ |
1301 | 764 | ||
1302 | static const char * sanity_check(void) | 765 | static const char *sanity_check(void) |
1303 | { | 766 | { |
1304 | dump_info(); | 767 | dump_info(); |
1305 | if (swsusp_info.version_code != LINUX_VERSION_CODE) | 768 | if (swsusp_info.version_code != LINUX_VERSION_CODE) |
@@ -1325,7 +788,7 @@ static const char * sanity_check(void) | |||
1325 | 788 | ||
1326 | static int check_header(void) | 789 | static int check_header(void) |
1327 | { | 790 | { |
1328 | const char * reason = NULL; | 791 | const char *reason = NULL; |
1329 | int error; | 792 | int error; |
1330 | 793 | ||
1331 | if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) | 794 | if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) |
@@ -1356,7 +819,7 @@ static int check_sig(void) | |||
1356 | * Reset swap signature now. | 819 | * Reset swap signature now. |
1357 | */ | 820 | */ |
1358 | error = bio_write_page(0, &swsusp_header); | 821 | error = bio_write_page(0, &swsusp_header); |
1359 | } else { | 822 | } else { |
1360 | return -EINVAL; | 823 | return -EINVAL; |
1361 | } | 824 | } |
1362 | if (!error) | 825 | if (!error) |
@@ -1373,7 +836,7 @@ static int check_sig(void) | |||
1373 | 836 | ||
1374 | static int data_read(struct pbe *pblist) | 837 | static int data_read(struct pbe *pblist) |
1375 | { | 838 | { |
1376 | struct pbe * p; | 839 | struct pbe *p; |
1377 | int error = 0; | 840 | int error = 0; |
1378 | int i = 0; | 841 | int i = 0; |
1379 | int mod = swsusp_info.image_pages / 100; | 842 | int mod = swsusp_info.image_pages / 100; |
@@ -1411,7 +874,7 @@ static int data_read(struct pbe *pblist) | |||
1411 | static int read_pagedir(struct pbe *pblist) | 874 | static int read_pagedir(struct pbe *pblist) |
1412 | { | 875 | { |
1413 | struct pbe *pbpage, *p; | 876 | struct pbe *pbpage, *p; |
1414 | unsigned i = 0; | 877 | unsigned int i = 0; |
1415 | int error; | 878 | int error; |
1416 | 879 | ||
1417 | if (!pblist) | 880 | if (!pblist) |
@@ -1433,10 +896,8 @@ static int read_pagedir(struct pbe *pblist) | |||
1433 | break; | 896 | break; |
1434 | } | 897 | } |
1435 | 898 | ||
1436 | if (error) | 899 | if (!error) |
1437 | free_page((unsigned long)pblist); | 900 | BUG_ON(i != swsusp_info.pagedir_pages); |
1438 | |||
1439 | BUG_ON(i != swsusp_info.pagedir_pages); | ||
1440 | 901 | ||
1441 | return error; | 902 | return error; |
1442 | } | 903 | } |
@@ -1460,32 +921,29 @@ static int read_suspend_image(void) | |||
1460 | int error = 0; | 921 | int error = 0; |
1461 | struct pbe *p; | 922 | struct pbe *p; |
1462 | 923 | ||
1463 | if (!(p = alloc_pagedir(nr_copy_pages))) | 924 | if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) |
1464 | return -ENOMEM; | 925 | return -ENOMEM; |
1465 | 926 | ||
1466 | if ((error = read_pagedir(p))) | 927 | if ((error = read_pagedir(p))) |
1467 | return error; | 928 | return error; |
1468 | |||
1469 | create_pbe_list(p, nr_copy_pages); | 929 | create_pbe_list(p, nr_copy_pages); |
1470 | 930 | mark_unsafe_pages(p); | |
1471 | if (!(pagedir_nosave = swsusp_pagedir_relocate(p))) | 931 | pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); |
932 | if (pagedir_nosave) { | ||
933 | create_pbe_list(pagedir_nosave, nr_copy_pages); | ||
934 | copy_page_backup_list(pagedir_nosave, p); | ||
935 | } | ||
936 | free_pagedir(p); | ||
937 | if (!pagedir_nosave) | ||
1472 | return -ENOMEM; | 938 | return -ENOMEM; |
1473 | 939 | ||
1474 | /* Allocate memory for the image and read the data from swap */ | 940 | /* Allocate memory for the image and read the data from swap */ |
1475 | 941 | ||
1476 | error = check_pagedir(pagedir_nosave); | 942 | error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); |
1477 | free_eaten_memory(); | 943 | |
1478 | if (!error) | 944 | if (!error) |
1479 | error = data_read(pagedir_nosave); | 945 | error = data_read(pagedir_nosave); |
1480 | 946 | ||
1481 | if (error) { /* We fail cleanly */ | ||
1482 | for_each_pbe (p, pagedir_nosave) | ||
1483 | if (p->address) { | ||
1484 | free_page(p->address); | ||
1485 | p->address = 0UL; | ||
1486 | } | ||
1487 | free_pagedir(pagedir_nosave); | ||
1488 | } | ||
1489 | return error; | 947 | return error; |
1490 | } | 948 | } |
1491 | 949 | ||
diff --git a/kernel/printk.c b/kernel/printk.c index 4b8f0f9230a4..5287be83e3e7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -10,7 +10,7 @@ | |||
10 | * elsewhere, in preparation for a serial line console (someday). | 10 | * elsewhere, in preparation for a serial line console (someday). |
11 | * Ted Ts'o, 2/11/93. | 11 | * Ted Ts'o, 2/11/93. |
12 | * Modified for sysctl support, 1/8/97, Chris Horn. | 12 | * Modified for sysctl support, 1/8/97, Chris Horn. |
13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul | 13 | * Fixed SMP synchronization, 08/08/99, Manfred Spraul |
14 | * manfreds@colorfullife.com | 14 | * manfreds@colorfullife.com |
15 | * Rewrote bits to get rid of console_lock | 15 | * Rewrote bits to get rid of console_lock |
16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> | 16 | * 01Mar01 Andrew Morton <andrewm@uow.edu.au> |
@@ -148,7 +148,7 @@ static int __init console_setup(char *str) | |||
148 | if (!strcmp(str, "ttyb")) | 148 | if (!strcmp(str, "ttyb")) |
149 | strcpy(name, "ttyS1"); | 149 | strcpy(name, "ttyS1"); |
150 | #endif | 150 | #endif |
151 | for(s = name; *s; s++) | 151 | for (s = name; *s; s++) |
152 | if ((*s >= '0' && *s <= '9') || *s == ',') | 152 | if ((*s >= '0' && *s <= '9') || *s == ',') |
153 | break; | 153 | break; |
154 | idx = simple_strtoul(s, NULL, 10); | 154 | idx = simple_strtoul(s, NULL, 10); |
@@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str) | |||
169 | size = roundup_pow_of_two(size); | 169 | size = roundup_pow_of_two(size); |
170 | if (size > log_buf_len) { | 170 | if (size > log_buf_len) { |
171 | unsigned long start, dest_idx, offset; | 171 | unsigned long start, dest_idx, offset; |
172 | char * new_log_buf; | 172 | char *new_log_buf; |
173 | 173 | ||
174 | new_log_buf = alloc_bootmem(size); | 174 | new_log_buf = alloc_bootmem(size); |
175 | if (!new_log_buf) { | 175 | if (!new_log_buf) { |
176 | printk("log_buf_len: allocation failed\n"); | 176 | printk(KERN_WARNING "log_buf_len: allocation failed\n"); |
177 | goto out; | 177 | goto out; |
178 | } | 178 | } |
179 | 179 | ||
@@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str) | |||
193 | log_end -= offset; | 193 | log_end -= offset; |
194 | spin_unlock_irqrestore(&logbuf_lock, flags); | 194 | spin_unlock_irqrestore(&logbuf_lock, flags); |
195 | 195 | ||
196 | printk("log_buf_len: %d\n", log_buf_len); | 196 | printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); |
197 | } | 197 | } |
198 | out: | 198 | out: |
199 | |||
200 | return 1; | 199 | return 1; |
201 | } | 200 | } |
202 | 201 | ||
@@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup); | |||
217 | * 9 -- Return number of unread characters in the log buffer | 216 | * 9 -- Return number of unread characters in the log buffer |
218 | * 10 -- Return size of the log buffer | 217 | * 10 -- Return size of the log buffer |
219 | */ | 218 | */ |
220 | int do_syslog(int type, char __user * buf, int len) | 219 | int do_syslog(int type, char __user *buf, int len) |
221 | { | 220 | { |
222 | unsigned long i, j, limit, count; | 221 | unsigned long i, j, limit, count; |
223 | int do_clear = 0; | 222 | int do_clear = 0; |
@@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len) | |||
244 | error = -EFAULT; | 243 | error = -EFAULT; |
245 | goto out; | 244 | goto out; |
246 | } | 245 | } |
247 | error = wait_event_interruptible(log_wait, (log_start - log_end)); | 246 | error = wait_event_interruptible(log_wait, |
247 | (log_start - log_end)); | ||
248 | if (error) | 248 | if (error) |
249 | goto out; | 249 | goto out; |
250 | i = 0; | 250 | i = 0; |
@@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len) | |||
264 | error = i; | 264 | error = i; |
265 | break; | 265 | break; |
266 | case 4: /* Read/clear last kernel messages */ | 266 | case 4: /* Read/clear last kernel messages */ |
267 | do_clear = 1; | 267 | do_clear = 1; |
268 | /* FALL THRU */ | 268 | /* FALL THRU */ |
269 | case 3: /* Read last kernel messages */ | 269 | case 3: /* Read last kernel messages */ |
270 | error = -EINVAL; | 270 | error = -EINVAL; |
@@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len) | |||
288 | limit = log_end; | 288 | limit = log_end; |
289 | /* | 289 | /* |
290 | * __put_user() could sleep, and while we sleep | 290 | * __put_user() could sleep, and while we sleep |
291 | * printk() could overwrite the messages | 291 | * printk() could overwrite the messages |
292 | * we try to copy to user space. Therefore | 292 | * we try to copy to user space. Therefore |
293 | * the messages are copied in reverse. <manfreds> | 293 | * the messages are copied in reverse. <manfreds> |
294 | */ | 294 | */ |
295 | for(i = 0; i < count && !error; i++) { | 295 | for (i = 0; i < count && !error; i++) { |
296 | j = limit-1-i; | 296 | j = limit-1-i; |
297 | if (j + log_buf_len < log_end) | 297 | if (j + log_buf_len < log_end) |
298 | break; | 298 | break; |
@@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len) | |||
306 | if (error) | 306 | if (error) |
307 | break; | 307 | break; |
308 | error = i; | 308 | error = i; |
309 | if(i != count) { | 309 | if (i != count) { |
310 | int offset = count-error; | 310 | int offset = count-error; |
311 | /* buffer overflow during copy, correct user buffer. */ | 311 | /* buffer overflow during copy, correct user buffer. */ |
312 | for(i=0;i<error;i++) { | 312 | for (i = 0; i < error; i++) { |
313 | if (__get_user(c,&buf[i+offset]) || | 313 | if (__get_user(c,&buf[i+offset]) || |
314 | __put_user(c,&buf[i])) { | 314 | __put_user(c,&buf[i])) { |
315 | error = -EFAULT; | 315 | error = -EFAULT; |
@@ -351,7 +351,7 @@ out: | |||
351 | return error; | 351 | return error; |
352 | } | 352 | } |
353 | 353 | ||
354 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | 354 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
355 | { | 355 | { |
356 | return do_syslog(type, buf, len); | 356 | return do_syslog(type, buf, len); |
357 | } | 357 | } |
@@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end) | |||
404 | cur_index = start; | 404 | cur_index = start; |
405 | start_print = start; | 405 | start_print = start; |
406 | while (cur_index != end) { | 406 | while (cur_index != end) { |
407 | if ( msg_level < 0 && | 407 | if (msg_level < 0 && ((end - cur_index) > 2) && |
408 | ((end - cur_index) > 2) && | 408 | LOG_BUF(cur_index + 0) == '<' && |
409 | LOG_BUF(cur_index + 0) == '<' && | 409 | LOG_BUF(cur_index + 1) >= '0' && |
410 | LOG_BUF(cur_index + 1) >= '0' && | 410 | LOG_BUF(cur_index + 1) <= '7' && |
411 | LOG_BUF(cur_index + 1) <= '7' && | 411 | LOG_BUF(cur_index + 2) == '>') { |
412 | LOG_BUF(cur_index + 2) == '>') | ||
413 | { | ||
414 | msg_level = LOG_BUF(cur_index + 1) - '0'; | 412 | msg_level = LOG_BUF(cur_index + 1) - '0'; |
415 | cur_index += 3; | 413 | cur_index += 3; |
416 | start_print = cur_index; | 414 | start_print = cur_index; |
417 | } | 415 | } |
418 | while (cur_index != end) { | 416 | while (cur_index != end) { |
419 | char c = LOG_BUF(cur_index); | 417 | char c = LOG_BUF(cur_index); |
420 | cur_index++; | ||
421 | 418 | ||
419 | cur_index++; | ||
422 | if (c == '\n') { | 420 | if (c == '\n') { |
423 | if (msg_level < 0) { | 421 | if (msg_level < 0) { |
424 | /* | 422 | /* |
@@ -461,7 +459,7 @@ static void zap_locks(void) | |||
461 | static unsigned long oops_timestamp; | 459 | static unsigned long oops_timestamp; |
462 | 460 | ||
463 | if (time_after_eq(jiffies, oops_timestamp) && | 461 | if (time_after_eq(jiffies, oops_timestamp) && |
464 | !time_after(jiffies, oops_timestamp + 30*HZ)) | 462 | !time_after(jiffies, oops_timestamp + 30 * HZ)) |
465 | return; | 463 | return; |
466 | 464 | ||
467 | oops_timestamp = jiffies; | 465 | oops_timestamp = jiffies; |
@@ -493,9 +491,12 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
493 | return sched_clock(); | 491 | return sched_clock(); |
494 | } | 492 | } |
495 | 493 | ||
496 | /* | 494 | /** |
495 | * printk - print a kernel message | ||
496 | * @fmt: format string | ||
497 | * | ||
497 | * This is printk. It can be called from any context. We want it to work. | 498 | * This is printk. It can be called from any context. We want it to work. |
498 | * | 499 | * |
499 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and | 500 | * We try to grab the console_sem. If we succeed, it's easy - we log the output and |
500 | * call the console drivers. If we fail to get the semaphore we place the output | 501 | * call the console drivers. If we fail to get the semaphore we place the output |
501 | * into the log buffer and return. The current holder of the console_sem will | 502 | * into the log buffer and return. The current holder of the console_sem will |
@@ -505,6 +506,9 @@ __attribute__((weak)) unsigned long long printk_clock(void) | |||
505 | * One effect of this deferred printing is that code which calls printk() and | 506 | * One effect of this deferred printing is that code which calls printk() and |
506 | * then changes console_loglevel may break. This is because console_loglevel | 507 | * then changes console_loglevel may break. This is because console_loglevel |
507 | * is inspected when the actual printing occurs. | 508 | * is inspected when the actual printing occurs. |
509 | * | ||
510 | * See also: | ||
511 | * printf(3) | ||
508 | */ | 512 | */ |
509 | 513 | ||
510 | asmlinkage int printk(const char *fmt, ...) | 514 | asmlinkage int printk(const char *fmt, ...) |
@@ -639,18 +643,27 @@ EXPORT_SYMBOL(vprintk); | |||
639 | 643 | ||
640 | #else | 644 | #else |
641 | 645 | ||
642 | asmlinkage long sys_syslog(int type, char __user * buf, int len) | 646 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
643 | { | 647 | { |
644 | return 0; | 648 | return 0; |
645 | } | 649 | } |
646 | 650 | ||
647 | int do_syslog(int type, char __user * buf, int len) { return 0; } | 651 | int do_syslog(int type, char __user *buf, int len) |
648 | static void call_console_drivers(unsigned long start, unsigned long end) {} | 652 | { |
653 | return 0; | ||
654 | } | ||
655 | |||
656 | static void call_console_drivers(unsigned long start, unsigned long end) | ||
657 | { | ||
658 | } | ||
649 | 659 | ||
650 | #endif | 660 | #endif |
651 | 661 | ||
652 | /** | 662 | /** |
653 | * add_preferred_console - add a device to the list of preferred consoles. | 663 | * add_preferred_console - add a device to the list of preferred consoles. |
664 | * @name: device name | ||
665 | * @idx: device index | ||
666 | * @options: options for this console | ||
654 | * | 667 | * |
655 | * The last preferred console added will be used for kernel messages | 668 | * The last preferred console added will be used for kernel messages |
656 | * and stdin/out/err for init. Normally this is used by console_setup | 669 | * and stdin/out/err for init. Normally this is used by console_setup |
@@ -760,7 +773,8 @@ void release_console_sem(void) | |||
760 | } | 773 | } |
761 | EXPORT_SYMBOL(release_console_sem); | 774 | EXPORT_SYMBOL(release_console_sem); |
762 | 775 | ||
763 | /** console_conditional_schedule - yield the CPU if required | 776 | /** |
777 | * console_conditional_schedule - yield the CPU if required | ||
764 | * | 778 | * |
765 | * If the console code is currently allowed to sleep, and | 779 | * If the console code is currently allowed to sleep, and |
766 | * if this CPU should yield the CPU to another task, do | 780 | * if this CPU should yield the CPU to another task, do |
@@ -802,7 +816,6 @@ void console_unblank(void) | |||
802 | c->unblank(); | 816 | c->unblank(); |
803 | release_console_sem(); | 817 | release_console_sem(); |
804 | } | 818 | } |
805 | EXPORT_SYMBOL(console_unblank); | ||
806 | 819 | ||
807 | /* | 820 | /* |
808 | * Return the console tty driver structure and its associated index | 821 | * Return the console tty driver structure and its associated index |
@@ -851,9 +864,9 @@ EXPORT_SYMBOL(console_start); | |||
851 | * print any messages that were printed by the kernel before the | 864 | * print any messages that were printed by the kernel before the |
852 | * console driver was initialized. | 865 | * console driver was initialized. |
853 | */ | 866 | */ |
854 | void register_console(struct console * console) | 867 | void register_console(struct console *console) |
855 | { | 868 | { |
856 | int i; | 869 | int i; |
857 | unsigned long flags; | 870 | unsigned long flags; |
858 | 871 | ||
859 | if (preferred_console < 0) | 872 | if (preferred_console < 0) |
@@ -878,7 +891,8 @@ void register_console(struct console * console) | |||
878 | * See if this console matches one we selected on | 891 | * See if this console matches one we selected on |
879 | * the command line. | 892 | * the command line. |
880 | */ | 893 | */ |
881 | for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { | 894 | for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; |
895 | i++) { | ||
882 | if (strcmp(console_cmdline[i].name, console->name) != 0) | 896 | if (strcmp(console_cmdline[i].name, console->name) != 0) |
883 | continue; | 897 | continue; |
884 | if (console->index >= 0 && | 898 | if (console->index >= 0 && |
@@ -933,26 +947,26 @@ void register_console(struct console * console) | |||
933 | } | 947 | } |
934 | EXPORT_SYMBOL(register_console); | 948 | EXPORT_SYMBOL(register_console); |
935 | 949 | ||
936 | int unregister_console(struct console * console) | 950 | int unregister_console(struct console *console) |
937 | { | 951 | { |
938 | struct console *a,*b; | 952 | struct console *a, *b; |
939 | int res = 1; | 953 | int res = 1; |
940 | 954 | ||
941 | acquire_console_sem(); | 955 | acquire_console_sem(); |
942 | if (console_drivers == console) { | 956 | if (console_drivers == console) { |
943 | console_drivers=console->next; | 957 | console_drivers=console->next; |
944 | res = 0; | 958 | res = 0; |
945 | } else { | 959 | } else if (console_drivers) { |
946 | for (a=console_drivers->next, b=console_drivers ; | 960 | for (a=console_drivers->next, b=console_drivers ; |
947 | a; b=a, a=b->next) { | 961 | a; b=a, a=b->next) { |
948 | if (a == console) { | 962 | if (a == console) { |
949 | b->next = a->next; | 963 | b->next = a->next; |
950 | res = 0; | 964 | res = 0; |
951 | break; | 965 | break; |
952 | } | 966 | } |
953 | } | 967 | } |
954 | } | 968 | } |
955 | 969 | ||
956 | /* If last console is removed, we re-enable picking the first | 970 | /* If last console is removed, we re-enable picking the first |
957 | * one that gets registered. Without that, pmac early boot console | 971 | * one that gets registered. Without that, pmac early boot console |
958 | * would prevent fbcon from taking over. | 972 | * would prevent fbcon from taking over. |
@@ -972,6 +986,8 @@ EXPORT_SYMBOL(unregister_console); | |||
972 | 986 | ||
973 | /** | 987 | /** |
974 | * tty_write_message - write a message to a certain tty, not just the console. | 988 | * tty_write_message - write a message to a certain tty, not just the console. |
989 | * @tty: the destination tty_struct | ||
990 | * @msg: the message to write | ||
975 | * | 991 | * |
976 | * This is used for messages that need to be redirected to a specific tty. | 992 | * This is used for messages that need to be redirected to a specific tty. |
977 | * We don't put it into the syslog queue right now maybe in the future if | 993 | * We don't put it into the syslog queue right now maybe in the future if |
@@ -994,7 +1010,7 @@ void tty_write_message(struct tty_struct *tty, char *msg) | |||
994 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | 1010 | int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) |
995 | { | 1011 | { |
996 | static DEFINE_SPINLOCK(ratelimit_lock); | 1012 | static DEFINE_SPINLOCK(ratelimit_lock); |
997 | static unsigned long toks = 10*5*HZ; | 1013 | static unsigned long toks = 10 * 5 * HZ; |
998 | static unsigned long last_msg; | 1014 | static unsigned long last_msg; |
999 | static int missed; | 1015 | static int missed; |
1000 | unsigned long flags; | 1016 | unsigned long flags; |
@@ -1007,6 +1023,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | |||
1007 | toks = ratelimit_burst * ratelimit_jiffies; | 1023 | toks = ratelimit_burst * ratelimit_jiffies; |
1008 | if (toks >= ratelimit_jiffies) { | 1024 | if (toks >= ratelimit_jiffies) { |
1009 | int lost = missed; | 1025 | int lost = missed; |
1026 | |||
1010 | missed = 0; | 1027 | missed = 0; |
1011 | toks -= ratelimit_jiffies; | 1028 | toks -= ratelimit_jiffies; |
1012 | spin_unlock_irqrestore(&ratelimit_lock, flags); | 1029 | spin_unlock_irqrestore(&ratelimit_lock, flags); |
@@ -1021,7 +1038,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) | |||
1021 | EXPORT_SYMBOL(__printk_ratelimit); | 1038 | EXPORT_SYMBOL(__printk_ratelimit); |
1022 | 1039 | ||
1023 | /* minimum time in jiffies between messages */ | 1040 | /* minimum time in jiffies between messages */ |
1024 | int printk_ratelimit_jiffies = 5*HZ; | 1041 | int printk_ratelimit_jiffies = 5 * HZ; |
1025 | 1042 | ||
1026 | /* number of messages we send before ratelimiting */ | 1043 | /* number of messages we send before ratelimiting */ |
1027 | int printk_ratelimit_burst = 10; | 1044 | int printk_ratelimit_burst = 10; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 019e04ec065a..656476eedb1b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child) | |||
56 | signal_wake_up(child, 1); | 56 | signal_wake_up(child, 1); |
57 | } | 57 | } |
58 | } | 58 | } |
59 | if (child->signal->flags & SIGNAL_GROUP_EXIT) { | ||
60 | sigaddset(&child->pending.signal, SIGKILL); | ||
61 | signal_wake_up(child, 1); | ||
62 | } | ||
59 | spin_unlock(&child->sighand->siglock); | 63 | spin_unlock(&child->sighand->siglock); |
60 | } | 64 | } |
61 | 65 | ||
@@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child) | |||
77 | SET_LINKS(child); | 81 | SET_LINKS(child); |
78 | } | 82 | } |
79 | 83 | ||
80 | if (child->state == TASK_TRACED) | 84 | ptrace_untrace(child); |
81 | ptrace_untrace(child); | ||
82 | } | 85 | } |
83 | 86 | ||
84 | /* | 87 | /* |
@@ -152,7 +155,7 @@ int ptrace_attach(struct task_struct *task) | |||
152 | retval = -EPERM; | 155 | retval = -EPERM; |
153 | if (task->pid <= 1) | 156 | if (task->pid <= 1) |
154 | goto bad; | 157 | goto bad; |
155 | if (task == current) | 158 | if (task->tgid == current->tgid) |
156 | goto bad; | 159 | goto bad; |
157 | /* the same process cannot be attached many times */ | 160 | /* the same process cannot be attached many times */ |
158 | if (task->ptrace & PT_PTRACED) | 161 | if (task->ptrace & PT_PTRACED) |
@@ -238,7 +241,8 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
238 | if (write) { | 241 | if (write) { |
239 | copy_to_user_page(vma, page, addr, | 242 | copy_to_user_page(vma, page, addr, |
240 | maddr + offset, buf, bytes); | 243 | maddr + offset, buf, bytes); |
241 | set_page_dirty_lock(page); | 244 | if (!PageCompound(page)) |
245 | set_page_dirty_lock(page); | ||
242 | } else { | 246 | } else { |
243 | copy_from_user_page(vma, page, addr, | 247 | copy_from_user_page(vma, page, addr, |
244 | buf, maddr + offset, bytes); | 248 | buf, maddr + offset, bytes); |
@@ -403,3 +407,85 @@ int ptrace_request(struct task_struct *child, long request, | |||
403 | 407 | ||
404 | return ret; | 408 | return ret; |
405 | } | 409 | } |
410 | |||
411 | #ifndef __ARCH_SYS_PTRACE | ||
412 | static int ptrace_get_task_struct(long request, long pid, | ||
413 | struct task_struct **childp) | ||
414 | { | ||
415 | struct task_struct *child; | ||
416 | int ret; | ||
417 | |||
418 | /* | ||
419 | * Callers use child == NULL as an indication to exit early even | ||
420 | * when the return value is 0, so make sure it is initialized to NULL here. | ||
421 | */ | ||
422 | *childp = NULL; | ||
423 | |||
424 | if (request == PTRACE_TRACEME) { | ||
425 | /* | ||
426 | * Are we already being traced? | ||
427 | */ | ||
428 | if (current->ptrace & PT_PTRACED) | ||
429 | return -EPERM; | ||
430 | ret = security_ptrace(current->parent, current); | ||
431 | if (ret) | ||
432 | return -EPERM; | ||
433 | /* | ||
434 | * Set the ptrace bit in the process ptrace flags. | ||
435 | */ | ||
436 | current->ptrace |= PT_PTRACED; | ||
437 | return 0; | ||
438 | } | ||
439 | |||
440 | /* | ||
441 | * You may not mess with init | ||
442 | */ | ||
443 | if (pid == 1) | ||
444 | return -EPERM; | ||
445 | |||
446 | ret = -ESRCH; | ||
447 | read_lock(&tasklist_lock); | ||
448 | child = find_task_by_pid(pid); | ||
449 | if (child) | ||
450 | get_task_struct(child); | ||
451 | read_unlock(&tasklist_lock); | ||
452 | if (!child) | ||
453 | return -ESRCH; | ||
454 | |||
455 | *childp = child; | ||
456 | return 0; | ||
457 | } | ||
458 | |||
459 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | ||
460 | { | ||
461 | struct task_struct *child; | ||
462 | long ret; | ||
463 | |||
464 | /* | ||
465 | * This lock_kernel fixes a subtle race with suid exec | ||
466 | */ | ||
467 | lock_kernel(); | ||
468 | ret = ptrace_get_task_struct(request, pid, &child); | ||
469 | if (!child) | ||
470 | goto out; | ||
471 | |||
472 | if (request == PTRACE_ATTACH) { | ||
473 | ret = ptrace_attach(child); | ||
474 | goto out_put_task_struct; | ||
475 | } | ||
476 | |||
477 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
478 | if (ret < 0) | ||
479 | goto out_put_task_struct; | ||
480 | |||
481 | ret = arch_ptrace(child, request, addr, data); | ||
482 | if (ret < 0) | ||
483 | goto out_put_task_struct; | ||
484 | |||
485 | out_put_task_struct: | ||
486 | put_task_struct(child); | ||
487 | out: | ||
488 | unlock_kernel(); | ||
489 | return ret; | ||
490 | } | ||
491 | #endif /* __ARCH_SYS_PTRACE */ | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index bef3b6901b76..c4d159a21e04 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -71,7 +71,7 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | |||
71 | 71 | ||
72 | /* Fake initialization required by compiler */ | 72 | /* Fake initialization required by compiler */ |
73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | 73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; |
74 | static int maxbatch = 10; | 74 | static int maxbatch = 10000; |
75 | 75 | ||
76 | #ifndef __HAVE_ARCH_CMPXCHG | 76 | #ifndef __HAVE_ARCH_CMPXCHG |
77 | /* | 77 | /* |
@@ -109,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head, | |||
109 | rdp = &__get_cpu_var(rcu_data); | 109 | rdp = &__get_cpu_var(rcu_data); |
110 | *rdp->nxttail = head; | 110 | *rdp->nxttail = head; |
111 | rdp->nxttail = &head->next; | 111 | rdp->nxttail = &head->next; |
112 | |||
113 | if (unlikely(++rdp->count > 10000)) | ||
114 | set_need_resched(); | ||
115 | |||
112 | local_irq_restore(flags); | 116 | local_irq_restore(flags); |
113 | } | 117 | } |
114 | 118 | ||
@@ -140,10 +144,25 @@ void fastcall call_rcu_bh(struct rcu_head *head, | |||
140 | rdp = &__get_cpu_var(rcu_bh_data); | 144 | rdp = &__get_cpu_var(rcu_bh_data); |
141 | *rdp->nxttail = head; | 145 | *rdp->nxttail = head; |
142 | rdp->nxttail = &head->next; | 146 | rdp->nxttail = &head->next; |
147 | rdp->count++; | ||
148 | /* | ||
149 | * Should we directly call rcu_do_batch() here ? | ||
150 | * if (unlikely(rdp->count > 10000)) | ||
151 | * rcu_do_batch(rdp); | ||
152 | */ | ||
143 | local_irq_restore(flags); | 153 | local_irq_restore(flags); |
144 | } | 154 | } |
145 | 155 | ||
146 | /* | 156 | /* |
157 | * Return the number of RCU batches processed thus far. Useful | ||
158 | * for debug and statistics. | ||
159 | */ | ||
160 | long rcu_batches_completed(void) | ||
161 | { | ||
162 | return rcu_ctrlblk.completed; | ||
163 | } | ||
164 | |||
165 | /* | ||
147 | * Invoke the completed RCU callbacks. They are expected to be in | 166 | * Invoke the completed RCU callbacks. They are expected to be in |
148 | * a per-cpu list. | 167 | * a per-cpu list. |
149 | */ | 168 | */ |
@@ -157,6 +176,7 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
157 | next = rdp->donelist = list->next; | 176 | next = rdp->donelist = list->next; |
158 | list->func(list); | 177 | list->func(list); |
159 | list = next; | 178 | list = next; |
179 | rdp->count--; | ||
160 | if (++count >= maxbatch) | 180 | if (++count >= maxbatch) |
161 | break; | 181 | break; |
162 | } | 182 | } |
@@ -490,6 +510,7 @@ void synchronize_kernel(void) | |||
490 | } | 510 | } |
491 | 511 | ||
492 | module_param(maxbatch, int, 0); | 512 | module_param(maxbatch, int, 0); |
513 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
493 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ | 514 | EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ |
494 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ | 515 | EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ |
495 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 516 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 000000000000..88c28d476550 --- /dev/null +++ b/kernel/rcutorture.c | |||
@@ -0,0 +1,514 @@ | |||
1 | /* | ||
2 | * Read-Copy Update module-based torture test facility | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright (C) IBM Corporation, 2005 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * | ||
22 | * See also: Documentation/RCU/torture.txt | ||
23 | */ | ||
24 | #include <linux/types.h> | ||
25 | #include <linux/kernel.h> | ||
26 | #include <linux/init.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/kthread.h> | ||
29 | #include <linux/err.h> | ||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/smp.h> | ||
32 | #include <linux/rcupdate.h> | ||
33 | #include <linux/interrupt.h> | ||
34 | #include <linux/sched.h> | ||
35 | #include <asm/atomic.h> | ||
36 | #include <linux/bitops.h> | ||
37 | #include <linux/module.h> | ||
38 | #include <linux/completion.h> | ||
39 | #include <linux/moduleparam.h> | ||
40 | #include <linux/percpu.h> | ||
41 | #include <linux/notifier.h> | ||
42 | #include <linux/rcuref.h> | ||
43 | #include <linux/cpu.h> | ||
44 | #include <linux/random.h> | ||
45 | #include <linux/delay.h> | ||
46 | #include <linux/byteorder/swabb.h> | ||
47 | #include <linux/stat.h> | ||
48 | |||
49 | MODULE_LICENSE("GPL"); | ||
50 | |||
51 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | ||
52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | ||
53 | /* Defaults to "only at end of test". */ | ||
54 | static int verbose = 0; /* Print more debug info. */ | ||
55 | |||
56 | MODULE_PARM(nreaders, "i"); | ||
57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
58 | MODULE_PARM(stat_interval, "i"); | ||
59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
60 | MODULE_PARM(verbose, "i"); | ||
61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
62 | #define TORTURE_FLAG "rcutorture: " | ||
63 | #define PRINTK_STRING(s) \ | ||
64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | ||
65 | #define VERBOSE_PRINTK_STRING(s) \ | ||
66 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | ||
67 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | ||
68 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | ||
69 | |||
70 | static char printk_buf[4096]; | ||
71 | |||
72 | static int nrealreaders; | ||
73 | static struct task_struct *writer_task; | ||
74 | static struct task_struct **reader_tasks; | ||
75 | static struct task_struct *stats_task; | ||
76 | |||
77 | #define RCU_TORTURE_PIPE_LEN 10 | ||
78 | |||
79 | struct rcu_torture { | ||
80 | struct rcu_head rtort_rcu; | ||
81 | int rtort_pipe_count; | ||
82 | struct list_head rtort_free; | ||
83 | int rtort_mbtest; | ||
84 | }; | ||
85 | |||
86 | static int fullstop = 0; /* stop generating callbacks at test end. */ | ||
87 | static LIST_HEAD(rcu_torture_freelist); | ||
88 | static struct rcu_torture *rcu_torture_current = NULL; | ||
89 | static long rcu_torture_current_version = 0; | ||
90 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | ||
91 | static DEFINE_SPINLOCK(rcu_torture_lock); | ||
92 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | ||
93 | { 0 }; | ||
94 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = | ||
95 | { 0 }; | ||
96 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; | ||
97 | atomic_t n_rcu_torture_alloc; | ||
98 | atomic_t n_rcu_torture_alloc_fail; | ||
99 | atomic_t n_rcu_torture_free; | ||
100 | atomic_t n_rcu_torture_mberror; | ||
101 | atomic_t n_rcu_torture_error; | ||
102 | |||
103 | /* | ||
104 | * Allocate an element from the rcu_tortures pool. | ||
105 | */ | ||
106 | struct rcu_torture * | ||
107 | rcu_torture_alloc(void) | ||
108 | { | ||
109 | struct list_head *p; | ||
110 | |||
111 | spin_lock(&rcu_torture_lock); | ||
112 | if (list_empty(&rcu_torture_freelist)) { | ||
113 | atomic_inc(&n_rcu_torture_alloc_fail); | ||
114 | spin_unlock(&rcu_torture_lock); | ||
115 | return NULL; | ||
116 | } | ||
117 | atomic_inc(&n_rcu_torture_alloc); | ||
118 | p = rcu_torture_freelist.next; | ||
119 | list_del_init(p); | ||
120 | spin_unlock(&rcu_torture_lock); | ||
121 | return container_of(p, struct rcu_torture, rtort_free); | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * Free an element to the rcu_tortures pool. | ||
126 | */ | ||
127 | static void | ||
128 | rcu_torture_free(struct rcu_torture *p) | ||
129 | { | ||
130 | atomic_inc(&n_rcu_torture_free); | ||
131 | spin_lock(&rcu_torture_lock); | ||
132 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); | ||
133 | spin_unlock(&rcu_torture_lock); | ||
134 | } | ||
135 | |||
136 | static void | ||
137 | rcu_torture_cb(struct rcu_head *p) | ||
138 | { | ||
139 | int i; | ||
140 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
141 | |||
142 | if (fullstop) { | ||
143 | /* Test is ending, just drop callbacks on the floor. */ | ||
144 | /* The next initialization will pick up the pieces. */ | ||
145 | return; | ||
146 | } | ||
147 | i = rp->rtort_pipe_count; | ||
148 | if (i > RCU_TORTURE_PIPE_LEN) | ||
149 | i = RCU_TORTURE_PIPE_LEN; | ||
150 | atomic_inc(&rcu_torture_wcount[i]); | ||
151 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
152 | rp->rtort_mbtest = 0; | ||
153 | rcu_torture_free(rp); | ||
154 | } else | ||
155 | call_rcu(p, rcu_torture_cb); | ||
156 | } | ||
157 | |||
158 | struct rcu_random_state { | ||
159 | unsigned long rrs_state; | ||
160 | unsigned long rrs_count; | ||
161 | }; | ||
162 | |||
163 | #define RCU_RANDOM_MULT 39916801 /* prime */ | ||
164 | #define RCU_RANDOM_ADD 479001701 /* prime */ | ||
165 | #define RCU_RANDOM_REFRESH 10000 | ||
166 | |||
167 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } | ||
168 | |||
169 | /* | ||
170 | * Crude but fast random-number generator. Uses a linear congruential | ||
171 | * generator, with occasional help from get_random_bytes(). | ||
172 | */ | ||
173 | static long | ||
174 | rcu_random(struct rcu_random_state *rrsp) | ||
175 | { | ||
176 | long refresh; | ||
177 | |||
178 | if (--rrsp->rrs_count < 0) { | ||
179 | get_random_bytes(&refresh, sizeof(refresh)); | ||
180 | rrsp->rrs_state += refresh; | ||
181 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | ||
182 | } | ||
183 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | ||
184 | return swahw32(rrsp->rrs_state); | ||
185 | } | ||
186 | |||
187 | /* | ||
188 | * RCU torture writer kthread. Repeatedly substitutes a new structure | ||
189 | * for that pointed to by rcu_torture_current, freeing the old structure | ||
190 | * after a series of grace periods (the "pipeline"). | ||
191 | */ | ||
192 | static int | ||
193 | rcu_torture_writer(void *arg) | ||
194 | { | ||
195 | int i; | ||
196 | long oldbatch = rcu_batches_completed(); | ||
197 | struct rcu_torture *rp; | ||
198 | struct rcu_torture *old_rp; | ||
199 | static DEFINE_RCU_RANDOM(rand); | ||
200 | |||
201 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | ||
202 | set_user_nice(current, 19); | ||
203 | |||
204 | do { | ||
205 | schedule_timeout_uninterruptible(1); | ||
206 | if (rcu_batches_completed() == oldbatch) | ||
207 | continue; | ||
208 | if ((rp = rcu_torture_alloc()) == NULL) | ||
209 | continue; | ||
210 | rp->rtort_pipe_count = 0; | ||
211 | udelay(rcu_random(&rand) & 0x3ff); | ||
212 | old_rp = rcu_torture_current; | ||
213 | rp->rtort_mbtest = 1; | ||
214 | rcu_assign_pointer(rcu_torture_current, rp); | ||
215 | smp_wmb(); | ||
216 | if (old_rp != NULL) { | ||
217 | i = old_rp->rtort_pipe_count; | ||
218 | if (i > RCU_TORTURE_PIPE_LEN) | ||
219 | i = RCU_TORTURE_PIPE_LEN; | ||
220 | atomic_inc(&rcu_torture_wcount[i]); | ||
221 | old_rp->rtort_pipe_count++; | ||
222 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | ||
223 | } | ||
224 | rcu_torture_current_version++; | ||
225 | oldbatch = rcu_batches_completed(); | ||
226 | } while (!kthread_should_stop() && !fullstop); | ||
227 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | ||
228 | while (!kthread_should_stop()) | ||
229 | schedule_timeout_uninterruptible(1); | ||
230 | return 0; | ||
231 | } | ||
232 | |||
233 | /* | ||
234 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, | ||
235 | * incrementing the corresponding element of the pipeline array. The | ||
236 | * counter in the element should never be greater than 1; otherwise, the | ||
237 | * RCU implementation is broken. | ||
238 | */ | ||
239 | static int | ||
240 | rcu_torture_reader(void *arg) | ||
241 | { | ||
242 | int completed; | ||
243 | DEFINE_RCU_RANDOM(rand); | ||
244 | struct rcu_torture *p; | ||
245 | int pipe_count; | ||
246 | |||
247 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | ||
248 | set_user_nice(current, 19); | ||
249 | |||
250 | do { | ||
251 | rcu_read_lock(); | ||
252 | completed = rcu_batches_completed(); | ||
253 | p = rcu_dereference(rcu_torture_current); | ||
254 | if (p == NULL) { | ||
255 | /* Wait for rcu_torture_writer to get underway */ | ||
256 | rcu_read_unlock(); | ||
257 | schedule_timeout_interruptible(HZ); | ||
258 | continue; | ||
259 | } | ||
260 | if (p->rtort_mbtest == 0) | ||
261 | atomic_inc(&n_rcu_torture_mberror); | ||
262 | udelay(rcu_random(&rand) & 0x7f); | ||
263 | preempt_disable(); | ||
264 | pipe_count = p->rtort_pipe_count; | ||
265 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | ||
266 | /* Should not happen, but... */ | ||
267 | pipe_count = RCU_TORTURE_PIPE_LEN; | ||
268 | } | ||
269 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | ||
270 | completed = rcu_batches_completed() - completed; | ||
271 | if (completed > RCU_TORTURE_PIPE_LEN) { | ||
272 | /* Should not happen, but... */ | ||
273 | completed = RCU_TORTURE_PIPE_LEN; | ||
274 | } | ||
275 | ++__get_cpu_var(rcu_torture_batch)[completed]; | ||
276 | preempt_enable(); | ||
277 | rcu_read_unlock(); | ||
278 | schedule(); | ||
279 | } while (!kthread_should_stop() && !fullstop); | ||
280 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | ||
281 | while (!kthread_should_stop()) | ||
282 | schedule_timeout_uninterruptible(1); | ||
283 | return 0; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Create an RCU-torture statistics message in the specified buffer. | ||
288 | */ | ||
289 | static int | ||
290 | rcu_torture_printk(char *page) | ||
291 | { | ||
292 | int cnt = 0; | ||
293 | int cpu; | ||
294 | int i; | ||
295 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
296 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
297 | |||
298 | for_each_cpu(cpu) { | ||
299 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
300 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | ||
301 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | ||
302 | } | ||
303 | } | ||
304 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { | ||
305 | if (pipesummary[i] != 0) | ||
306 | break; | ||
307 | } | ||
308 | cnt += sprintf(&page[cnt], "rcutorture: "); | ||
309 | cnt += sprintf(&page[cnt], | ||
310 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | ||
311 | "rtmbe: %d", | ||
312 | rcu_torture_current, | ||
313 | rcu_torture_current_version, | ||
314 | list_empty(&rcu_torture_freelist), | ||
315 | atomic_read(&n_rcu_torture_alloc), | ||
316 | atomic_read(&n_rcu_torture_alloc_fail), | ||
317 | atomic_read(&n_rcu_torture_free), | ||
318 | atomic_read(&n_rcu_torture_mberror)); | ||
319 | if (atomic_read(&n_rcu_torture_mberror) != 0) | ||
320 | cnt += sprintf(&page[cnt], " !!!"); | ||
321 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
322 | if (i > 1) { | ||
323 | cnt += sprintf(&page[cnt], "!!! "); | ||
324 | atomic_inc(&n_rcu_torture_error); | ||
325 | } | ||
326 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | ||
327 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
328 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | ||
329 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
330 | cnt += sprintf(&page[cnt], "Reader Batch: "); | ||
331 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | ||
332 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | ||
333 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | ||
334 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | ||
335 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
336 | cnt += sprintf(&page[cnt], " %d", | ||
337 | atomic_read(&rcu_torture_wcount[i])); | ||
338 | } | ||
339 | cnt += sprintf(&page[cnt], "\n"); | ||
340 | return cnt; | ||
341 | } | ||
342 | |||
343 | /* | ||
344 | * Print torture statistics. Caller must ensure that there is only | ||
345 | * one call to this function at a given time!!! This is normally | ||
346 | * accomplished by relying on the module system to only have one copy | ||
347 | * of the module loaded, and then by giving the rcu_torture_stats | ||
348 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
349 | * thread is not running). | ||
350 | */ | ||
351 | static void | ||
352 | rcu_torture_stats_print(void) | ||
353 | { | ||
354 | int cnt; | ||
355 | |||
356 | cnt = rcu_torture_printk(printk_buf); | ||
357 | printk(KERN_ALERT "%s", printk_buf); | ||
358 | } | ||
359 | |||
360 | /* | ||
361 | * Periodically prints torture statistics, if periodic statistics printing | ||
362 | * was specified via the stat_interval module parameter. | ||
363 | * | ||
364 | * No need to worry about fullstop here, since this one doesn't reference | ||
365 | * volatile state or register callbacks. | ||
366 | */ | ||
367 | static int | ||
368 | rcu_torture_stats(void *arg) | ||
369 | { | ||
370 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); | ||
371 | do { | ||
372 | schedule_timeout_interruptible(stat_interval * HZ); | ||
373 | rcu_torture_stats_print(); | ||
374 | } while (!kthread_should_stop()); | ||
375 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); | ||
376 | return 0; | ||
377 | } | ||
378 | |||
379 | static void | ||
380 | rcu_torture_cleanup(void) | ||
381 | { | ||
382 | int i; | ||
383 | |||
384 | fullstop = 1; | ||
385 | if (writer_task != NULL) { | ||
386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | ||
387 | kthread_stop(writer_task); | ||
388 | } | ||
389 | writer_task = NULL; | ||
390 | |||
391 | if (reader_tasks != NULL) { | ||
392 | for (i = 0; i < nrealreaders; i++) { | ||
393 | if (reader_tasks[i] != NULL) { | ||
394 | VERBOSE_PRINTK_STRING( | ||
395 | "Stopping rcu_torture_reader task"); | ||
396 | kthread_stop(reader_tasks[i]); | ||
397 | } | ||
398 | reader_tasks[i] = NULL; | ||
399 | } | ||
400 | kfree(reader_tasks); | ||
401 | reader_tasks = NULL; | ||
402 | } | ||
403 | rcu_torture_current = NULL; | ||
404 | |||
405 | if (stats_task != NULL) { | ||
406 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | ||
407 | kthread_stop(stats_task); | ||
408 | } | ||
409 | stats_task = NULL; | ||
410 | |||
411 | /* Wait for all RCU callbacks to fire. */ | ||
412 | |||
413 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | ||
414 | synchronize_rcu(); | ||
415 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | ||
416 | printk(KERN_ALERT TORTURE_FLAG | ||
417 | "--- End of test: %s\n", | ||
418 | atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE"); | ||
419 | } | ||
420 | |||
421 | static int | ||
422 | rcu_torture_init(void) | ||
423 | { | ||
424 | int i; | ||
425 | int cpu; | ||
426 | int firsterr = 0; | ||
427 | |||
428 | /* Process args and tell the world that the torturer is on the job. */ | ||
429 | |||
430 | if (nreaders >= 0) | ||
431 | nrealreaders = nreaders; | ||
432 | else | ||
433 | nrealreaders = 2 * num_online_cpus(); | ||
434 | printk(KERN_ALERT TORTURE_FLAG | ||
435 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | ||
436 | nrealreaders, stat_interval, verbose); | ||
437 | fullstop = 0; | ||
438 | |||
439 | /* Set up the freelist. */ | ||
440 | |||
441 | INIT_LIST_HEAD(&rcu_torture_freelist); | ||
442 | for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) { | ||
443 | rcu_tortures[i].rtort_mbtest = 0; | ||
444 | list_add_tail(&rcu_tortures[i].rtort_free, | ||
445 | &rcu_torture_freelist); | ||
446 | } | ||
447 | |||
448 | /* Initialize the statistics so that each run gets its own numbers. */ | ||
449 | |||
450 | rcu_torture_current = NULL; | ||
451 | rcu_torture_current_version = 0; | ||
452 | atomic_set(&n_rcu_torture_alloc, 0); | ||
453 | atomic_set(&n_rcu_torture_alloc_fail, 0); | ||
454 | atomic_set(&n_rcu_torture_free, 0); | ||
455 | atomic_set(&n_rcu_torture_mberror, 0); | ||
456 | atomic_set(&n_rcu_torture_error, 0); | ||
457 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
458 | atomic_set(&rcu_torture_wcount[i], 0); | ||
459 | for_each_cpu(cpu) { | ||
460 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
461 | per_cpu(rcu_torture_count, cpu)[i] = 0; | ||
462 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | ||
463 | } | ||
464 | } | ||
465 | |||
466 | /* Start up the kthreads. */ | ||
467 | |||
468 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | ||
469 | writer_task = kthread_run(rcu_torture_writer, NULL, | ||
470 | "rcu_torture_writer"); | ||
471 | if (IS_ERR(writer_task)) { | ||
472 | firsterr = PTR_ERR(writer_task); | ||
473 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | ||
474 | writer_task = NULL; | ||
475 | goto unwind; | ||
476 | } | ||
477 | reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]), | ||
478 | GFP_KERNEL); | ||
479 | if (reader_tasks == NULL) { | ||
480 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | ||
481 | firsterr = -ENOMEM; | ||
482 | goto unwind; | ||
483 | } | ||
484 | for (i = 0; i < nrealreaders; i++) { | ||
485 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); | ||
486 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, | ||
487 | "rcu_torture_reader"); | ||
488 | if (IS_ERR(reader_tasks[i])) { | ||
489 | firsterr = PTR_ERR(reader_tasks[i]); | ||
490 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); | ||
491 | reader_tasks[i] = NULL; | ||
492 | goto unwind; | ||
493 | } | ||
494 | } | ||
495 | if (stat_interval > 0) { | ||
496 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); | ||
497 | stats_task = kthread_run(rcu_torture_stats, NULL, | ||
498 | "rcu_torture_stats"); | ||
499 | if (IS_ERR(stats_task)) { | ||
500 | firsterr = PTR_ERR(stats_task); | ||
501 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); | ||
502 | stats_task = NULL; | ||
503 | goto unwind; | ||
504 | } | ||
505 | } | ||
506 | return 0; | ||
507 | |||
508 | unwind: | ||
509 | rcu_torture_cleanup(); | ||
510 | return firsterr; | ||
511 | } | ||
512 | |||
513 | module_init(rcu_torture_init); | ||
514 | module_exit(rcu_torture_cleanup); | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 1f31a528fdba..6f46c94cc29e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -206,6 +206,7 @@ struct runqueue { | |||
206 | */ | 206 | */ |
207 | unsigned long nr_running; | 207 | unsigned long nr_running; |
208 | #ifdef CONFIG_SMP | 208 | #ifdef CONFIG_SMP |
209 | unsigned long prio_bias; | ||
209 | unsigned long cpu_load[3]; | 210 | unsigned long cpu_load[3]; |
210 | #endif | 211 | #endif |
211 | unsigned long long nr_switches; | 212 | unsigned long long nr_switches; |
@@ -659,13 +660,68 @@ static int effective_prio(task_t *p) | |||
659 | return prio; | 660 | return prio; |
660 | } | 661 | } |
661 | 662 | ||
663 | #ifdef CONFIG_SMP | ||
664 | static inline void inc_prio_bias(runqueue_t *rq, int prio) | ||
665 | { | ||
666 | rq->prio_bias += MAX_PRIO - prio; | ||
667 | } | ||
668 | |||
669 | static inline void dec_prio_bias(runqueue_t *rq, int prio) | ||
670 | { | ||
671 | rq->prio_bias -= MAX_PRIO - prio; | ||
672 | } | ||
673 | |||
674 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
675 | { | ||
676 | rq->nr_running++; | ||
677 | if (rt_task(p)) { | ||
678 | if (p != rq->migration_thread) | ||
679 | /* | ||
680 | * The migration thread does the actual balancing. Do | ||
681 | * not bias by its priority as the ultra high priority | ||
682 | * will skew balancing adversely. | ||
683 | */ | ||
684 | inc_prio_bias(rq, p->prio); | ||
685 | } else | ||
686 | inc_prio_bias(rq, p->static_prio); | ||
687 | } | ||
688 | |||
689 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
690 | { | ||
691 | rq->nr_running--; | ||
692 | if (rt_task(p)) { | ||
693 | if (p != rq->migration_thread) | ||
694 | dec_prio_bias(rq, p->prio); | ||
695 | } else | ||
696 | dec_prio_bias(rq, p->static_prio); | ||
697 | } | ||
698 | #else | ||
699 | static inline void inc_prio_bias(runqueue_t *rq, int prio) | ||
700 | { | ||
701 | } | ||
702 | |||
703 | static inline void dec_prio_bias(runqueue_t *rq, int prio) | ||
704 | { | ||
705 | } | ||
706 | |||
707 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
708 | { | ||
709 | rq->nr_running++; | ||
710 | } | ||
711 | |||
712 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
713 | { | ||
714 | rq->nr_running--; | ||
715 | } | ||
716 | #endif | ||
717 | |||
662 | /* | 718 | /* |
663 | * __activate_task - move a task to the runqueue. | 719 | * __activate_task - move a task to the runqueue. |
664 | */ | 720 | */ |
665 | static inline void __activate_task(task_t *p, runqueue_t *rq) | 721 | static inline void __activate_task(task_t *p, runqueue_t *rq) |
666 | { | 722 | { |
667 | enqueue_task(p, rq->active); | 723 | enqueue_task(p, rq->active); |
668 | rq->nr_running++; | 724 | inc_nr_running(p, rq); |
669 | } | 725 | } |
670 | 726 | ||
671 | /* | 727 | /* |
@@ -674,7 +730,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) | |||
674 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 730 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
675 | { | 731 | { |
676 | enqueue_task_head(p, rq->active); | 732 | enqueue_task_head(p, rq->active); |
677 | rq->nr_running++; | 733 | inc_nr_running(p, rq); |
678 | } | 734 | } |
679 | 735 | ||
680 | static int recalc_task_prio(task_t *p, unsigned long long now) | 736 | static int recalc_task_prio(task_t *p, unsigned long long now) |
@@ -759,7 +815,8 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
759 | } | 815 | } |
760 | #endif | 816 | #endif |
761 | 817 | ||
762 | p->prio = recalc_task_prio(p, now); | 818 | if (!rt_task(p)) |
819 | p->prio = recalc_task_prio(p, now); | ||
763 | 820 | ||
764 | /* | 821 | /* |
765 | * This checks to make sure it's not an uninterruptible task | 822 | * This checks to make sure it's not an uninterruptible task |
@@ -793,7 +850,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
793 | */ | 850 | */ |
794 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 851 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
795 | { | 852 | { |
796 | rq->nr_running--; | 853 | dec_nr_running(p, rq); |
797 | dequeue_task(p, p->array); | 854 | dequeue_task(p, p->array); |
798 | p->array = NULL; | 855 | p->array = NULL; |
799 | } | 856 | } |
@@ -808,21 +865,28 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) | |||
808 | #ifdef CONFIG_SMP | 865 | #ifdef CONFIG_SMP |
809 | static void resched_task(task_t *p) | 866 | static void resched_task(task_t *p) |
810 | { | 867 | { |
811 | int need_resched, nrpolling; | 868 | int cpu; |
812 | 869 | ||
813 | assert_spin_locked(&task_rq(p)->lock); | 870 | assert_spin_locked(&task_rq(p)->lock); |
814 | 871 | ||
815 | /* minimise the chance of sending an interrupt to poll_idle() */ | 872 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) |
816 | nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); | 873 | return; |
817 | need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); | 874 | |
818 | nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); | 875 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); |
876 | |||
877 | cpu = task_cpu(p); | ||
878 | if (cpu == smp_processor_id()) | ||
879 | return; | ||
819 | 880 | ||
820 | if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) | 881 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ |
821 | smp_send_reschedule(task_cpu(p)); | 882 | smp_mb(); |
883 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | ||
884 | smp_send_reschedule(cpu); | ||
822 | } | 885 | } |
823 | #else | 886 | #else |
824 | static inline void resched_task(task_t *p) | 887 | static inline void resched_task(task_t *p) |
825 | { | 888 | { |
889 | assert_spin_locked(&task_rq(p)->lock); | ||
826 | set_tsk_need_resched(p); | 890 | set_tsk_need_resched(p); |
827 | } | 891 | } |
828 | #endif | 892 | #endif |
@@ -930,27 +994,61 @@ void kick_process(task_t *p) | |||
930 | * We want to under-estimate the load of migration sources, to | 994 | * We want to under-estimate the load of migration sources, to |
931 | * balance conservatively. | 995 | * balance conservatively. |
932 | */ | 996 | */ |
933 | static inline unsigned long source_load(int cpu, int type) | 997 | static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) |
934 | { | 998 | { |
935 | runqueue_t *rq = cpu_rq(cpu); | 999 | runqueue_t *rq = cpu_rq(cpu); |
936 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1000 | unsigned long running = rq->nr_running; |
1001 | unsigned long source_load, cpu_load = rq->cpu_load[type-1], | ||
1002 | load_now = running * SCHED_LOAD_SCALE; | ||
1003 | |||
937 | if (type == 0) | 1004 | if (type == 0) |
938 | return load_now; | 1005 | source_load = load_now; |
1006 | else | ||
1007 | source_load = min(cpu_load, load_now); | ||
1008 | |||
1009 | if (running > 1 || (idle == NOT_IDLE && running)) | ||
1010 | /* | ||
1011 | * If we are busy rebalancing the load is biased by | ||
1012 | * priority to create 'nice' support across cpus. When | ||
1013 | * idle rebalancing we should only bias the source_load if | ||
1014 | * there is more than one task running on that queue to | ||
1015 | * prevent idle rebalance from trying to pull tasks from a | ||
1016 | * queue with only one running task. | ||
1017 | */ | ||
1018 | source_load = source_load * rq->prio_bias / running; | ||
1019 | |||
1020 | return source_load; | ||
1021 | } | ||
939 | 1022 | ||
940 | return min(rq->cpu_load[type-1], load_now); | 1023 | static inline unsigned long source_load(int cpu, int type) |
1024 | { | ||
1025 | return __source_load(cpu, type, NOT_IDLE); | ||
941 | } | 1026 | } |
942 | 1027 | ||
943 | /* | 1028 | /* |
944 | * Return a high guess at the load of a migration-target cpu | 1029 | * Return a high guess at the load of a migration-target cpu |
945 | */ | 1030 | */ |
946 | static inline unsigned long target_load(int cpu, int type) | 1031 | static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) |
947 | { | 1032 | { |
948 | runqueue_t *rq = cpu_rq(cpu); | 1033 | runqueue_t *rq = cpu_rq(cpu); |
949 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1034 | unsigned long running = rq->nr_running; |
1035 | unsigned long target_load, cpu_load = rq->cpu_load[type-1], | ||
1036 | load_now = running * SCHED_LOAD_SCALE; | ||
1037 | |||
950 | if (type == 0) | 1038 | if (type == 0) |
951 | return load_now; | 1039 | target_load = load_now; |
1040 | else | ||
1041 | target_load = max(cpu_load, load_now); | ||
1042 | |||
1043 | if (running > 1 || (idle == NOT_IDLE && running)) | ||
1044 | target_load = target_load * rq->prio_bias / running; | ||
1045 | |||
1046 | return target_load; | ||
1047 | } | ||
952 | 1048 | ||
953 | return max(rq->cpu_load[type-1], load_now); | 1049 | static inline unsigned long target_load(int cpu, int type) |
1050 | { | ||
1051 | return __target_load(cpu, type, NOT_IDLE); | ||
954 | } | 1052 | } |
955 | 1053 | ||
956 | /* | 1054 | /* |
@@ -1339,7 +1437,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1339 | #endif | 1437 | #endif |
1340 | #ifdef CONFIG_PREEMPT | 1438 | #ifdef CONFIG_PREEMPT |
1341 | /* Want to start with kernel preemption disabled. */ | 1439 | /* Want to start with kernel preemption disabled. */ |
1342 | p->thread_info->preempt_count = 1; | 1440 | task_thread_info(p)->preempt_count = 1; |
1343 | #endif | 1441 | #endif |
1344 | /* | 1442 | /* |
1345 | * Share the timeslice between parent and child, thus the | 1443 | * Share the timeslice between parent and child, thus the |
@@ -1411,7 +1509,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1411 | list_add_tail(&p->run_list, &current->run_list); | 1509 | list_add_tail(&p->run_list, &current->run_list); |
1412 | p->array = current->array; | 1510 | p->array = current->array; |
1413 | p->array->nr_active++; | 1511 | p->array->nr_active++; |
1414 | rq->nr_running++; | 1512 | inc_nr_running(p, rq); |
1415 | } | 1513 | } |
1416 | set_need_resched(); | 1514 | set_need_resched(); |
1417 | } else | 1515 | } else |
@@ -1468,7 +1566,7 @@ void fastcall sched_exit(task_t *p) | |||
1468 | * the sleep_avg of the parent as well. | 1566 | * the sleep_avg of the parent as well. |
1469 | */ | 1567 | */ |
1470 | rq = task_rq_lock(p->parent, &flags); | 1568 | rq = task_rq_lock(p->parent, &flags); |
1471 | if (p->first_time_slice) { | 1569 | if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { |
1472 | p->parent->time_slice += p->time_slice; | 1570 | p->parent->time_slice += p->time_slice; |
1473 | if (unlikely(p->parent->time_slice > task_timeslice(p))) | 1571 | if (unlikely(p->parent->time_slice > task_timeslice(p))) |
1474 | p->parent->time_slice = task_timeslice(p); | 1572 | p->parent->time_slice = task_timeslice(p); |
@@ -1756,9 +1854,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1756 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 1854 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1757 | { | 1855 | { |
1758 | dequeue_task(p, src_array); | 1856 | dequeue_task(p, src_array); |
1759 | src_rq->nr_running--; | 1857 | dec_nr_running(p, src_rq); |
1760 | set_task_cpu(p, this_cpu); | 1858 | set_task_cpu(p, this_cpu); |
1761 | this_rq->nr_running++; | 1859 | inc_nr_running(p, this_rq); |
1762 | enqueue_task(p, this_array); | 1860 | enqueue_task(p, this_array); |
1763 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 1861 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1764 | + this_rq->timestamp_last_tick; | 1862 | + this_rq->timestamp_last_tick; |
@@ -1937,9 +2035,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1937 | 2035 | ||
1938 | /* Bias balancing toward cpus of our domain */ | 2036 | /* Bias balancing toward cpus of our domain */ |
1939 | if (local_group) | 2037 | if (local_group) |
1940 | load = target_load(i, load_idx); | 2038 | load = __target_load(i, load_idx, idle); |
1941 | else | 2039 | else |
1942 | load = source_load(i, load_idx); | 2040 | load = __source_load(i, load_idx, idle); |
1943 | 2041 | ||
1944 | avg_load += load; | 2042 | avg_load += load; |
1945 | } | 2043 | } |
@@ -2044,14 +2142,15 @@ out_balanced: | |||
2044 | /* | 2142 | /* |
2045 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2143 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2046 | */ | 2144 | */ |
2047 | static runqueue_t *find_busiest_queue(struct sched_group *group) | 2145 | static runqueue_t *find_busiest_queue(struct sched_group *group, |
2146 | enum idle_type idle) | ||
2048 | { | 2147 | { |
2049 | unsigned long load, max_load = 0; | 2148 | unsigned long load, max_load = 0; |
2050 | runqueue_t *busiest = NULL; | 2149 | runqueue_t *busiest = NULL; |
2051 | int i; | 2150 | int i; |
2052 | 2151 | ||
2053 | for_each_cpu_mask(i, group->cpumask) { | 2152 | for_each_cpu_mask(i, group->cpumask) { |
2054 | load = source_load(i, 0); | 2153 | load = __source_load(i, 0, idle); |
2055 | 2154 | ||
2056 | if (load > max_load) { | 2155 | if (load > max_load) { |
2057 | max_load = load; | 2156 | max_load = load; |
@@ -2095,7 +2194,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2095 | goto out_balanced; | 2194 | goto out_balanced; |
2096 | } | 2195 | } |
2097 | 2196 | ||
2098 | busiest = find_busiest_queue(group); | 2197 | busiest = find_busiest_queue(group, idle); |
2099 | if (!busiest) { | 2198 | if (!busiest) { |
2100 | schedstat_inc(sd, lb_nobusyq[idle]); | 2199 | schedstat_inc(sd, lb_nobusyq[idle]); |
2101 | goto out_balanced; | 2200 | goto out_balanced; |
@@ -2218,7 +2317,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2218 | goto out_balanced; | 2317 | goto out_balanced; |
2219 | } | 2318 | } |
2220 | 2319 | ||
2221 | busiest = find_busiest_queue(group); | 2320 | busiest = find_busiest_queue(group, NEWLY_IDLE); |
2222 | if (!busiest) { | 2321 | if (!busiest) { |
2223 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2322 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2224 | goto out_balanced; | 2323 | goto out_balanced; |
@@ -2511,8 +2610,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
2511 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 2610 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
2512 | /* Account for system time used */ | 2611 | /* Account for system time used */ |
2513 | acct_update_integrals(p); | 2612 | acct_update_integrals(p); |
2514 | /* Update rss highwater mark */ | ||
2515 | update_mem_hiwater(p); | ||
2516 | } | 2613 | } |
2517 | 2614 | ||
2518 | /* | 2615 | /* |
@@ -3453,8 +3550,10 @@ void set_user_nice(task_t *p, long nice) | |||
3453 | goto out_unlock; | 3550 | goto out_unlock; |
3454 | } | 3551 | } |
3455 | array = p->array; | 3552 | array = p->array; |
3456 | if (array) | 3553 | if (array) { |
3457 | dequeue_task(p, array); | 3554 | dequeue_task(p, array); |
3555 | dec_prio_bias(rq, p->static_prio); | ||
3556 | } | ||
3458 | 3557 | ||
3459 | old_prio = p->prio; | 3558 | old_prio = p->prio; |
3460 | new_prio = NICE_TO_PRIO(nice); | 3559 | new_prio = NICE_TO_PRIO(nice); |
@@ -3464,6 +3563,7 @@ void set_user_nice(task_t *p, long nice) | |||
3464 | 3563 | ||
3465 | if (array) { | 3564 | if (array) { |
3466 | enqueue_task(p, array); | 3565 | enqueue_task(p, array); |
3566 | inc_prio_bias(rq, p->static_prio); | ||
3467 | /* | 3567 | /* |
3468 | * If the task increased its priority or is running and | 3568 | * If the task increased its priority or is running and |
3469 | * lowered its priority, then reschedule its CPU: | 3569 | * lowered its priority, then reschedule its CPU: |
@@ -3565,8 +3665,6 @@ int idle_cpu(int cpu) | |||
3565 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 3665 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; |
3566 | } | 3666 | } |
3567 | 3667 | ||
3568 | EXPORT_SYMBOL_GPL(idle_cpu); | ||
3569 | |||
3570 | /** | 3668 | /** |
3571 | * idle_task - return the idle task for a given cpu. | 3669 | * idle_task - return the idle task for a given cpu. |
3572 | * @cpu: the processor in question. | 3670 | * @cpu: the processor in question. |
@@ -4229,10 +4327,10 @@ static void show_task(task_t *p) | |||
4229 | #endif | 4327 | #endif |
4230 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4328 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4231 | { | 4329 | { |
4232 | unsigned long *n = (unsigned long *) (p->thread_info+1); | 4330 | unsigned long *n = end_of_stack(p); |
4233 | while (!*n) | 4331 | while (!*n) |
4234 | n++; | 4332 | n++; |
4235 | free = (unsigned long) n - (unsigned long)(p->thread_info+1); | 4333 | free = (unsigned long)n - (unsigned long)end_of_stack(p); |
4236 | } | 4334 | } |
4237 | #endif | 4335 | #endif |
4238 | printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); | 4336 | printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); |
@@ -4312,9 +4410,9 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
4312 | 4410 | ||
4313 | /* Set the preempt count _outside_ the spinlocks! */ | 4411 | /* Set the preempt count _outside_ the spinlocks! */ |
4314 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | 4412 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) |
4315 | idle->thread_info->preempt_count = (idle->lock_depth >= 0); | 4413 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); |
4316 | #else | 4414 | #else |
4317 | idle->thread_info->preempt_count = 0; | 4415 | task_thread_info(idle)->preempt_count = 0; |
4318 | #endif | 4416 | #endif |
4319 | } | 4417 | } |
4320 | 4418 | ||
@@ -4682,7 +4780,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4682 | #ifdef CONFIG_HOTPLUG_CPU | 4780 | #ifdef CONFIG_HOTPLUG_CPU |
4683 | case CPU_UP_CANCELED: | 4781 | case CPU_UP_CANCELED: |
4684 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 4782 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
4685 | kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); | 4783 | kthread_bind(cpu_rq(cpu)->migration_thread, |
4784 | any_online_cpu(cpu_online_map)); | ||
4686 | kthread_stop(cpu_rq(cpu)->migration_thread); | 4785 | kthread_stop(cpu_rq(cpu)->migration_thread); |
4687 | cpu_rq(cpu)->migration_thread = NULL; | 4786 | cpu_rq(cpu)->migration_thread = NULL; |
4688 | break; | 4787 | break; |
diff --git a/kernel/signal.c b/kernel/signal.c index b92c3c9f8b9a..d7611f189ef7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask) | |||
262 | return sig; | 262 | return sig; |
263 | } | 263 | } |
264 | 264 | ||
265 | static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, | 265 | static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, |
266 | int override_rlimit) | 266 | int override_rlimit) |
267 | { | 267 | { |
268 | struct sigqueue *q = NULL; | 268 | struct sigqueue *q = NULL; |
@@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __n | |||
277 | } else { | 277 | } else { |
278 | INIT_LIST_HEAD(&q->list); | 278 | INIT_LIST_HEAD(&q->list); |
279 | q->flags = 0; | 279 | q->flags = 0; |
280 | q->lock = NULL; | ||
281 | q->user = get_uid(t->user); | 280 | q->user = get_uid(t->user); |
282 | } | 281 | } |
283 | return(q); | 282 | return(q); |
@@ -397,20 +396,8 @@ void __exit_signal(struct task_struct *tsk) | |||
397 | flush_sigqueue(&tsk->pending); | 396 | flush_sigqueue(&tsk->pending); |
398 | if (sig) { | 397 | if (sig) { |
399 | /* | 398 | /* |
400 | * We are cleaning up the signal_struct here. We delayed | 399 | * We are cleaning up the signal_struct here. |
401 | * calling exit_itimers until after flush_sigqueue, just in | ||
402 | * case our thread-local pending queue contained a queued | ||
403 | * timer signal that would have been cleared in | ||
404 | * exit_itimers. When that called sigqueue_free, it would | ||
405 | * attempt to re-take the tasklist_lock and deadlock. This | ||
406 | * can never happen if we ensure that all queues the | ||
407 | * timer's signal might be queued on have been flushed | ||
408 | * first. The shared_pending queue, and our own pending | ||
409 | * queue are the only queues the timer could be on, since | ||
410 | * there are no other threads left in the group and timer | ||
411 | * signals are constrained to threads inside the group. | ||
412 | */ | 400 | */ |
413 | exit_itimers(sig); | ||
414 | exit_thread_group_keys(sig); | 401 | exit_thread_group_keys(sig); |
415 | kmem_cache_free(signal_cachep, sig); | 402 | kmem_cache_free(signal_cachep, sig); |
416 | } | 403 | } |
@@ -418,6 +405,8 @@ void __exit_signal(struct task_struct *tsk) | |||
418 | 405 | ||
419 | void exit_signal(struct task_struct *tsk) | 406 | void exit_signal(struct task_struct *tsk) |
420 | { | 407 | { |
408 | atomic_dec(&tsk->signal->live); | ||
409 | |||
421 | write_lock_irq(&tasklist_lock); | 410 | write_lock_irq(&tasklist_lock); |
422 | __exit_signal(tsk); | 411 | __exit_signal(tsk); |
423 | write_unlock_irq(&tasklist_lock); | 412 | write_unlock_irq(&tasklist_lock); |
@@ -524,16 +513,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, | |||
524 | { | 513 | { |
525 | int sig = 0; | 514 | int sig = 0; |
526 | 515 | ||
527 | /* SIGKILL must have priority, otherwise it is quite easy | 516 | sig = next_signal(pending, mask); |
528 | * to create an unkillable process, sending sig < SIGKILL | ||
529 | * to self */ | ||
530 | if (unlikely(sigismember(&pending->signal, SIGKILL))) { | ||
531 | if (!sigismember(mask, SIGKILL)) | ||
532 | sig = SIGKILL; | ||
533 | } | ||
534 | |||
535 | if (likely(!sig)) | ||
536 | sig = next_signal(pending, mask); | ||
537 | if (sig) { | 517 | if (sig) { |
538 | if (current->notifier) { | 518 | if (current->notifier) { |
539 | if (sigismember(current->notifier_mask, sig)) { | 519 | if (sigismember(current->notifier_mask, sig)) { |
@@ -578,7 +558,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | |||
578 | * is to alert stop-signal processing code when another | 558 | * is to alert stop-signal processing code when another |
579 | * processor has come along and cleared the flag. | 559 | * processor has come along and cleared the flag. |
580 | */ | 560 | */ |
581 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | 561 | if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) |
562 | tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; | ||
582 | } | 563 | } |
583 | if ( signr && | 564 | if ( signr && |
584 | ((info->si_code & __SI_MASK) == __SI_TIMER) && | 565 | ((info->si_code & __SI_MASK) == __SI_TIMER) && |
@@ -661,8 +642,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
661 | if (!valid_signal(sig)) | 642 | if (!valid_signal(sig)) |
662 | return error; | 643 | return error; |
663 | error = -EPERM; | 644 | error = -EPERM; |
664 | if ((!info || ((unsigned long)info != 1 && | 645 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
665 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
666 | && ((sig != SIGCONT) || | 646 | && ((sig != SIGCONT) || |
667 | (current->signal->session != t->signal->session)) | 647 | (current->signal->session != t->signal->session)) |
668 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 648 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
@@ -799,7 +779,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
799 | * fast-pathed signals for kernel-internal things like SIGSTOP | 779 | * fast-pathed signals for kernel-internal things like SIGSTOP |
800 | * or SIGKILL. | 780 | * or SIGKILL. |
801 | */ | 781 | */ |
802 | if ((unsigned long)info == 2) | 782 | if (info == SEND_SIG_FORCED) |
803 | goto out_set; | 783 | goto out_set; |
804 | 784 | ||
805 | /* Real-time signals must be queued if sent by sigqueue, or | 785 | /* Real-time signals must be queued if sent by sigqueue, or |
@@ -811,19 +791,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
811 | pass on the info struct. */ | 791 | pass on the info struct. */ |
812 | 792 | ||
813 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && | 793 | q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && |
814 | ((unsigned long) info < 2 || | 794 | (is_si_special(info) || |
815 | info->si_code >= 0))); | 795 | info->si_code >= 0))); |
816 | if (q) { | 796 | if (q) { |
817 | list_add_tail(&q->list, &signals->list); | 797 | list_add_tail(&q->list, &signals->list); |
818 | switch ((unsigned long) info) { | 798 | switch ((unsigned long) info) { |
819 | case 0: | 799 | case (unsigned long) SEND_SIG_NOINFO: |
820 | q->info.si_signo = sig; | 800 | q->info.si_signo = sig; |
821 | q->info.si_errno = 0; | 801 | q->info.si_errno = 0; |
822 | q->info.si_code = SI_USER; | 802 | q->info.si_code = SI_USER; |
823 | q->info.si_pid = current->pid; | 803 | q->info.si_pid = current->pid; |
824 | q->info.si_uid = current->uid; | 804 | q->info.si_uid = current->uid; |
825 | break; | 805 | break; |
826 | case 1: | 806 | case (unsigned long) SEND_SIG_PRIV: |
827 | q->info.si_signo = sig; | 807 | q->info.si_signo = sig; |
828 | q->info.si_errno = 0; | 808 | q->info.si_errno = 0; |
829 | q->info.si_code = SI_KERNEL; | 809 | q->info.si_code = SI_KERNEL; |
@@ -834,20 +814,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t, | |||
834 | copy_siginfo(&q->info, info); | 814 | copy_siginfo(&q->info, info); |
835 | break; | 815 | break; |
836 | } | 816 | } |
837 | } else { | 817 | } else if (!is_si_special(info)) { |
838 | if (sig >= SIGRTMIN && info && (unsigned long)info != 1 | 818 | if (sig >= SIGRTMIN && info->si_code != SI_USER) |
839 | && info->si_code != SI_USER) | ||
840 | /* | 819 | /* |
841 | * Queue overflow, abort. We may abort if the signal was rt | 820 | * Queue overflow, abort. We may abort if the signal was rt |
842 | * and sent by user using something other than kill(). | 821 | * and sent by user using something other than kill(). |
843 | */ | 822 | */ |
844 | return -EAGAIN; | 823 | return -EAGAIN; |
845 | if (((unsigned long)info > 1) && (info->si_code == SI_TIMER)) | ||
846 | /* | ||
847 | * Set up a return to indicate that we dropped | ||
848 | * the signal. | ||
849 | */ | ||
850 | ret = info->si_sys_private; | ||
851 | } | 824 | } |
852 | 825 | ||
853 | out_set: | 826 | out_set: |
@@ -868,12 +841,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
868 | BUG(); | 841 | BUG(); |
869 | assert_spin_locked(&t->sighand->siglock); | 842 | assert_spin_locked(&t->sighand->siglock); |
870 | 843 | ||
871 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
872 | /* | ||
873 | * Set up a return to indicate that we dropped the signal. | ||
874 | */ | ||
875 | ret = info->si_sys_private; | ||
876 | |||
877 | /* Short-circuit ignored signals. */ | 844 | /* Short-circuit ignored signals. */ |
878 | if (sig_ignored(t, sig)) | 845 | if (sig_ignored(t, sig)) |
879 | goto out; | 846 | goto out; |
@@ -903,11 +870,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
903 | int ret; | 870 | int ret; |
904 | 871 | ||
905 | spin_lock_irqsave(&t->sighand->siglock, flags); | 872 | spin_lock_irqsave(&t->sighand->siglock, flags); |
906 | if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { | 873 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { |
907 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | 874 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; |
875 | } | ||
876 | if (sigismember(&t->blocked, sig)) { | ||
908 | sigdelset(&t->blocked, sig); | 877 | sigdelset(&t->blocked, sig); |
909 | recalc_sigpending_tsk(t); | ||
910 | } | 878 | } |
879 | recalc_sigpending_tsk(t); | ||
911 | ret = specific_send_sig_info(sig, info, t); | 880 | ret = specific_send_sig_info(sig, info, t); |
912 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | 881 | spin_unlock_irqrestore(&t->sighand->siglock, flags); |
913 | 882 | ||
@@ -917,15 +886,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |||
917 | void | 886 | void |
918 | force_sig_specific(int sig, struct task_struct *t) | 887 | force_sig_specific(int sig, struct task_struct *t) |
919 | { | 888 | { |
920 | unsigned long int flags; | 889 | force_sig_info(sig, SEND_SIG_FORCED, t); |
921 | |||
922 | spin_lock_irqsave(&t->sighand->siglock, flags); | ||
923 | if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) | ||
924 | t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; | ||
925 | sigdelset(&t->blocked, sig); | ||
926 | recalc_sigpending_tsk(t); | ||
927 | specific_send_sig_info(sig, (void *)2, t); | ||
928 | spin_unlock_irqrestore(&t->sighand->siglock, flags); | ||
929 | } | 890 | } |
930 | 891 | ||
931 | /* | 892 | /* |
@@ -936,34 +897,31 @@ force_sig_specific(int sig, struct task_struct *t) | |||
936 | * as soon as they're available, so putting the signal on the shared queue | 897 | * as soon as they're available, so putting the signal on the shared queue |
937 | * will be equivalent to sending it to one such thread. | 898 | * will be equivalent to sending it to one such thread. |
938 | */ | 899 | */ |
939 | #define wants_signal(sig, p, mask) \ | 900 | static inline int wants_signal(int sig, struct task_struct *p) |
940 | (!sigismember(&(p)->blocked, sig) \ | 901 | { |
941 | && !((p)->state & mask) \ | 902 | if (sigismember(&p->blocked, sig)) |
942 | && !((p)->flags & PF_EXITING) \ | 903 | return 0; |
943 | && (task_curr(p) || !signal_pending(p))) | 904 | if (p->flags & PF_EXITING) |
944 | 905 | return 0; | |
906 | if (sig == SIGKILL) | ||
907 | return 1; | ||
908 | if (p->state & (TASK_STOPPED | TASK_TRACED)) | ||
909 | return 0; | ||
910 | return task_curr(p) || !signal_pending(p); | ||
911 | } | ||
945 | 912 | ||
946 | static void | 913 | static void |
947 | __group_complete_signal(int sig, struct task_struct *p) | 914 | __group_complete_signal(int sig, struct task_struct *p) |
948 | { | 915 | { |
949 | unsigned int mask; | ||
950 | struct task_struct *t; | 916 | struct task_struct *t; |
951 | 917 | ||
952 | /* | 918 | /* |
953 | * Don't bother traced and stopped tasks (but | ||
954 | * SIGKILL will punch through that). | ||
955 | */ | ||
956 | mask = TASK_STOPPED | TASK_TRACED; | ||
957 | if (sig == SIGKILL) | ||
958 | mask = 0; | ||
959 | |||
960 | /* | ||
961 | * Now find a thread we can wake up to take the signal off the queue. | 919 | * Now find a thread we can wake up to take the signal off the queue. |
962 | * | 920 | * |
963 | * If the main thread wants the signal, it gets first crack. | 921 | * If the main thread wants the signal, it gets first crack. |
964 | * Probably the least surprising to the average bear. | 922 | * Probably the least surprising to the average bear. |
965 | */ | 923 | */ |
966 | if (wants_signal(sig, p, mask)) | 924 | if (wants_signal(sig, p)) |
967 | t = p; | 925 | t = p; |
968 | else if (thread_group_empty(p)) | 926 | else if (thread_group_empty(p)) |
969 | /* | 927 | /* |
@@ -981,7 +939,7 @@ __group_complete_signal(int sig, struct task_struct *p) | |||
981 | t = p->signal->curr_target = p; | 939 | t = p->signal->curr_target = p; |
982 | BUG_ON(t->tgid != p->tgid); | 940 | BUG_ON(t->tgid != p->tgid); |
983 | 941 | ||
984 | while (!wants_signal(sig, t, mask)) { | 942 | while (!wants_signal(sig, t)) { |
985 | t = next_thread(t); | 943 | t = next_thread(t); |
986 | if (t == p->signal->curr_target) | 944 | if (t == p->signal->curr_target) |
987 | /* | 945 | /* |
@@ -1063,12 +1021,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1063 | assert_spin_locked(&p->sighand->siglock); | 1021 | assert_spin_locked(&p->sighand->siglock); |
1064 | handle_stop_signal(sig, p); | 1022 | handle_stop_signal(sig, p); |
1065 | 1023 | ||
1066 | if (((unsigned long)info > 2) && (info->si_code == SI_TIMER)) | ||
1067 | /* | ||
1068 | * Set up a return to indicate that we dropped the signal. | ||
1069 | */ | ||
1070 | ret = info->si_sys_private; | ||
1071 | |||
1072 | /* Short-circuit ignored signals. */ | 1024 | /* Short-circuit ignored signals. */ |
1073 | if (sig_ignored(p, sig)) | 1025 | if (sig_ignored(p, sig)) |
1074 | return ret; | 1026 | return ret; |
@@ -1121,8 +1073,8 @@ void zap_other_threads(struct task_struct *p) | |||
1121 | if (t != p->group_leader) | 1073 | if (t != p->group_leader) |
1122 | t->exit_signal = -1; | 1074 | t->exit_signal = -1; |
1123 | 1075 | ||
1076 | /* SIGKILL will be handled before any pending SIGSTOP */ | ||
1124 | sigaddset(&t->pending.signal, SIGKILL); | 1077 | sigaddset(&t->pending.signal, SIGKILL); |
1125 | rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); | ||
1126 | signal_wake_up(t, 1); | 1078 | signal_wake_up(t, 1); |
1127 | } | 1079 | } |
1128 | } | 1080 | } |
@@ -1195,6 +1147,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) | |||
1195 | return error; | 1147 | return error; |
1196 | } | 1148 | } |
1197 | 1149 | ||
1150 | /* like kill_proc_info(), but doesn't use uid/euid of "current" */ | ||
1151 | int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | ||
1152 | uid_t uid, uid_t euid) | ||
1153 | { | ||
1154 | int ret = -EINVAL; | ||
1155 | struct task_struct *p; | ||
1156 | |||
1157 | if (!valid_signal(sig)) | ||
1158 | return ret; | ||
1159 | |||
1160 | read_lock(&tasklist_lock); | ||
1161 | p = find_task_by_pid(pid); | ||
1162 | if (!p) { | ||
1163 | ret = -ESRCH; | ||
1164 | goto out_unlock; | ||
1165 | } | ||
1166 | if ((!info || ((unsigned long)info != 1 && | ||
1167 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
1168 | && (euid != p->suid) && (euid != p->uid) | ||
1169 | && (uid != p->suid) && (uid != p->uid)) { | ||
1170 | ret = -EPERM; | ||
1171 | goto out_unlock; | ||
1172 | } | ||
1173 | if (sig && p->sighand) { | ||
1174 | unsigned long flags; | ||
1175 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1176 | ret = __group_send_sig_info(sig, info, p); | ||
1177 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1178 | } | ||
1179 | out_unlock: | ||
1180 | read_unlock(&tasklist_lock); | ||
1181 | return ret; | ||
1182 | } | ||
1183 | EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); | ||
1198 | 1184 | ||
1199 | /* | 1185 | /* |
1200 | * kill_something_info() interprets pid in interesting ways just like kill(2). | 1186 | * kill_something_info() interprets pid in interesting ways just like kill(2). |
@@ -1264,10 +1250,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1264 | return ret; | 1250 | return ret; |
1265 | } | 1251 | } |
1266 | 1252 | ||
1253 | #define __si_special(priv) \ | ||
1254 | ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO) | ||
1255 | |||
1267 | int | 1256 | int |
1268 | send_sig(int sig, struct task_struct *p, int priv) | 1257 | send_sig(int sig, struct task_struct *p, int priv) |
1269 | { | 1258 | { |
1270 | return send_sig_info(sig, (void*)(long)(priv != 0), p); | 1259 | return send_sig_info(sig, __si_special(priv), p); |
1271 | } | 1260 | } |
1272 | 1261 | ||
1273 | /* | 1262 | /* |
@@ -1287,7 +1276,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p) | |||
1287 | void | 1276 | void |
1288 | force_sig(int sig, struct task_struct *p) | 1277 | force_sig(int sig, struct task_struct *p) |
1289 | { | 1278 | { |
1290 | force_sig_info(sig, (void*)1L, p); | 1279 | force_sig_info(sig, SEND_SIG_PRIV, p); |
1291 | } | 1280 | } |
1292 | 1281 | ||
1293 | /* | 1282 | /* |
@@ -1312,13 +1301,13 @@ force_sigsegv(int sig, struct task_struct *p) | |||
1312 | int | 1301 | int |
1313 | kill_pg(pid_t pgrp, int sig, int priv) | 1302 | kill_pg(pid_t pgrp, int sig, int priv) |
1314 | { | 1303 | { |
1315 | return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); | 1304 | return kill_pg_info(sig, __si_special(priv), pgrp); |
1316 | } | 1305 | } |
1317 | 1306 | ||
1318 | int | 1307 | int |
1319 | kill_proc(pid_t pid, int sig, int priv) | 1308 | kill_proc(pid_t pid, int sig, int priv) |
1320 | { | 1309 | { |
1321 | return kill_proc_info(sig, (void *)(long)(priv != 0), pid); | 1310 | return kill_proc_info(sig, __si_special(priv), pid); |
1322 | } | 1311 | } |
1323 | 1312 | ||
1324 | /* | 1313 | /* |
@@ -1349,11 +1338,12 @@ void sigqueue_free(struct sigqueue *q) | |||
1349 | * pending queue. | 1338 | * pending queue. |
1350 | */ | 1339 | */ |
1351 | if (unlikely(!list_empty(&q->list))) { | 1340 | if (unlikely(!list_empty(&q->list))) { |
1352 | read_lock(&tasklist_lock); | 1341 | spinlock_t *lock = &current->sighand->siglock; |
1353 | spin_lock_irqsave(q->lock, flags); | 1342 | read_lock(&tasklist_lock); |
1343 | spin_lock_irqsave(lock, flags); | ||
1354 | if (!list_empty(&q->list)) | 1344 | if (!list_empty(&q->list)) |
1355 | list_del_init(&q->list); | 1345 | list_del_init(&q->list); |
1356 | spin_unlock_irqrestore(q->lock, flags); | 1346 | spin_unlock_irqrestore(lock, flags); |
1357 | read_unlock(&tasklist_lock); | 1347 | read_unlock(&tasklist_lock); |
1358 | } | 1348 | } |
1359 | q->flags &= ~SIGQUEUE_PREALLOC; | 1349 | q->flags &= ~SIGQUEUE_PREALLOC; |
@@ -1392,7 +1382,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1392 | goto out; | 1382 | goto out; |
1393 | } | 1383 | } |
1394 | 1384 | ||
1395 | q->lock = &p->sighand->siglock; | ||
1396 | list_add_tail(&q->list, &p->pending.list); | 1385 | list_add_tail(&q->list, &p->pending.list); |
1397 | sigaddset(&p->pending.signal, sig); | 1386 | sigaddset(&p->pending.signal, sig); |
1398 | if (!sigismember(&p->blocked, sig)) | 1387 | if (!sigismember(&p->blocked, sig)) |
@@ -1440,7 +1429,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1440 | * We always use the shared queue for process-wide signals, | 1429 | * We always use the shared queue for process-wide signals, |
1441 | * to avoid several races. | 1430 | * to avoid several races. |
1442 | */ | 1431 | */ |
1443 | q->lock = &p->sighand->siglock; | ||
1444 | list_add_tail(&q->list, &p->signal->shared_pending.list); | 1432 | list_add_tail(&q->list, &p->signal->shared_pending.list); |
1445 | sigaddset(&p->signal->shared_pending.signal, sig); | 1433 | sigaddset(&p->signal->shared_pending.signal, sig); |
1446 | 1434 | ||
@@ -1502,7 +1490,7 @@ void do_notify_parent(struct task_struct *tsk, int sig) | |||
1502 | 1490 | ||
1503 | psig = tsk->parent->sighand; | 1491 | psig = tsk->parent->sighand; |
1504 | spin_lock_irqsave(&psig->siglock, flags); | 1492 | spin_lock_irqsave(&psig->siglock, flags); |
1505 | if (sig == SIGCHLD && | 1493 | if (!tsk->ptrace && sig == SIGCHLD && |
1506 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || | 1494 | (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || |
1507 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { | 1495 | (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { |
1508 | /* | 1496 | /* |
@@ -1766,7 +1754,8 @@ do_signal_stop(int signr) | |||
1766 | * stop is always done with the siglock held, | 1754 | * stop is always done with the siglock held, |
1767 | * so this check has no races. | 1755 | * so this check has no races. |
1768 | */ | 1756 | */ |
1769 | if (t->state < TASK_STOPPED) { | 1757 | if (!t->exit_state && |
1758 | !(t->state & (TASK_STOPPED|TASK_TRACED))) { | ||
1770 | stop_count++; | 1759 | stop_count++; |
1771 | signal_wake_up(t, 0); | 1760 | signal_wake_up(t, 0); |
1772 | } | 1761 | } |
@@ -1858,9 +1847,9 @@ relock: | |||
1858 | /* Let the debugger run. */ | 1847 | /* Let the debugger run. */ |
1859 | ptrace_stop(signr, signr, info); | 1848 | ptrace_stop(signr, signr, info); |
1860 | 1849 | ||
1861 | /* We're back. Did the debugger cancel the sig? */ | 1850 | /* We're back. Did the debugger cancel the sig or group_exit? */ |
1862 | signr = current->exit_code; | 1851 | signr = current->exit_code; |
1863 | if (signr == 0) | 1852 | if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT) |
1864 | continue; | 1853 | continue; |
1865 | 1854 | ||
1866 | current->exit_code = 0; | 1855 | current->exit_code = 0; |
@@ -2262,26 +2251,13 @@ sys_kill(int pid, int sig) | |||
2262 | return kill_something_info(sig, &info, pid); | 2251 | return kill_something_info(sig, &info, pid); |
2263 | } | 2252 | } |
2264 | 2253 | ||
2265 | /** | 2254 | static int do_tkill(int tgid, int pid, int sig) |
2266 | * sys_tgkill - send signal to one specific thread | ||
2267 | * @tgid: the thread group ID of the thread | ||
2268 | * @pid: the PID of the thread | ||
2269 | * @sig: signal to be sent | ||
2270 | * | ||
2271 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
2272 | * exists but it's not belonging to the target process anymore. This | ||
2273 | * method solves the problem of threads exiting and PIDs getting reused. | ||
2274 | */ | ||
2275 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
2276 | { | 2255 | { |
2277 | struct siginfo info; | ||
2278 | int error; | 2256 | int error; |
2257 | struct siginfo info; | ||
2279 | struct task_struct *p; | 2258 | struct task_struct *p; |
2280 | 2259 | ||
2281 | /* This is only valid for single tasks */ | 2260 | error = -ESRCH; |
2282 | if (pid <= 0 || tgid <= 0) | ||
2283 | return -EINVAL; | ||
2284 | |||
2285 | info.si_signo = sig; | 2261 | info.si_signo = sig; |
2286 | info.si_errno = 0; | 2262 | info.si_errno = 0; |
2287 | info.si_code = SI_TKILL; | 2263 | info.si_code = SI_TKILL; |
@@ -2290,8 +2266,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) | |||
2290 | 2266 | ||
2291 | read_lock(&tasklist_lock); | 2267 | read_lock(&tasklist_lock); |
2292 | p = find_task_by_pid(pid); | 2268 | p = find_task_by_pid(pid); |
2293 | error = -ESRCH; | 2269 | if (p && (tgid <= 0 || p->tgid == tgid)) { |
2294 | if (p && (p->tgid == tgid)) { | ||
2295 | error = check_kill_permission(sig, &info, p); | 2270 | error = check_kill_permission(sig, &info, p); |
2296 | /* | 2271 | /* |
2297 | * The null signal is a permissions and process existence | 2272 | * The null signal is a permissions and process existence |
@@ -2305,47 +2280,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig) | |||
2305 | } | 2280 | } |
2306 | } | 2281 | } |
2307 | read_unlock(&tasklist_lock); | 2282 | read_unlock(&tasklist_lock); |
2283 | |||
2308 | return error; | 2284 | return error; |
2309 | } | 2285 | } |
2310 | 2286 | ||
2287 | /** | ||
2288 | * sys_tgkill - send signal to one specific thread | ||
2289 | * @tgid: the thread group ID of the thread | ||
2290 | * @pid: the PID of the thread | ||
2291 | * @sig: signal to be sent | ||
2292 | * | ||
2293 | * This syscall also checks the tgid and returns -ESRCH even if the PID | ||
2294 | * exists but it's not belonging to the target process anymore. This | ||
2295 | * method solves the problem of threads exiting and PIDs getting reused. | ||
2296 | */ | ||
2297 | asmlinkage long sys_tgkill(int tgid, int pid, int sig) | ||
2298 | { | ||
2299 | /* This is only valid for single tasks */ | ||
2300 | if (pid <= 0 || tgid <= 0) | ||
2301 | return -EINVAL; | ||
2302 | |||
2303 | return do_tkill(tgid, pid, sig); | ||
2304 | } | ||
2305 | |||
2311 | /* | 2306 | /* |
2312 | * Send a signal to only one task, even if it's a CLONE_THREAD task. | 2307 | * Send a signal to only one task, even if it's a CLONE_THREAD task. |
2313 | */ | 2308 | */ |
2314 | asmlinkage long | 2309 | asmlinkage long |
2315 | sys_tkill(int pid, int sig) | 2310 | sys_tkill(int pid, int sig) |
2316 | { | 2311 | { |
2317 | struct siginfo info; | ||
2318 | int error; | ||
2319 | struct task_struct *p; | ||
2320 | |||
2321 | /* This is only valid for single tasks */ | 2312 | /* This is only valid for single tasks */ |
2322 | if (pid <= 0) | 2313 | if (pid <= 0) |
2323 | return -EINVAL; | 2314 | return -EINVAL; |
2324 | 2315 | ||
2325 | info.si_signo = sig; | 2316 | return do_tkill(0, pid, sig); |
2326 | info.si_errno = 0; | ||
2327 | info.si_code = SI_TKILL; | ||
2328 | info.si_pid = current->tgid; | ||
2329 | info.si_uid = current->uid; | ||
2330 | |||
2331 | read_lock(&tasklist_lock); | ||
2332 | p = find_task_by_pid(pid); | ||
2333 | error = -ESRCH; | ||
2334 | if (p) { | ||
2335 | error = check_kill_permission(sig, &info, p); | ||
2336 | /* | ||
2337 | * The null signal is a permissions and process existence | ||
2338 | * probe. No signal is actually delivered. | ||
2339 | */ | ||
2340 | if (!error && sig && p->sighand) { | ||
2341 | spin_lock_irq(&p->sighand->siglock); | ||
2342 | handle_stop_signal(sig, p); | ||
2343 | error = specific_send_sig_info(sig, &info, p); | ||
2344 | spin_unlock_irq(&p->sighand->siglock); | ||
2345 | } | ||
2346 | } | ||
2347 | read_unlock(&tasklist_lock); | ||
2348 | return error; | ||
2349 | } | 2317 | } |
2350 | 2318 | ||
2351 | asmlinkage long | 2319 | asmlinkage long |
diff --git a/kernel/softirq.c b/kernel/softirq.c index f766b2fc48be..ad3295cdded5 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -470,7 +470,8 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
470 | #ifdef CONFIG_HOTPLUG_CPU | 470 | #ifdef CONFIG_HOTPLUG_CPU |
471 | case CPU_UP_CANCELED: | 471 | case CPU_UP_CANCELED: |
472 | /* Unbind so it can run. Fall thru. */ | 472 | /* Unbind so it can run. Fall thru. */ |
473 | kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id()); | 473 | kthread_bind(per_cpu(ksoftirqd, hotcpu), |
474 | any_online_cpu(cpu_online_map)); | ||
474 | case CPU_DEAD: | 475 | case CPU_DEAD: |
475 | p = per_cpu(ksoftirqd, hotcpu); | 476 | p = per_cpu(ksoftirqd, hotcpu); |
476 | per_cpu(ksoftirqd, hotcpu) = NULL; | 477 | per_cpu(ksoftirqd, hotcpu) = NULL; |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 75976209cea7..c67189a25d52 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -73,9 +73,6 @@ void softlockup_tick(struct pt_regs *regs) | |||
73 | static int watchdog(void * __bind_cpu) | 73 | static int watchdog(void * __bind_cpu) |
74 | { | 74 | { |
75 | struct sched_param param = { .sched_priority = 99 }; | 75 | struct sched_param param = { .sched_priority = 99 }; |
76 | int this_cpu = (long) __bind_cpu; | ||
77 | |||
78 | printk("softlockup thread %d started up.\n", this_cpu); | ||
79 | 76 | ||
80 | sched_setscheduler(current, SCHED_FIFO, &param); | 77 | sched_setscheduler(current, SCHED_FIFO, &param); |
81 | current->flags |= PF_NOFREEZE; | 78 | current->flags |= PF_NOFREEZE; |
@@ -123,7 +120,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
123 | #ifdef CONFIG_HOTPLUG_CPU | 120 | #ifdef CONFIG_HOTPLUG_CPU |
124 | case CPU_UP_CANCELED: | 121 | case CPU_UP_CANCELED: |
125 | /* Unbind so it can run. Fall thru. */ | 122 | /* Unbind so it can run. Fall thru. */ |
126 | kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); | 123 | kthread_bind(per_cpu(watchdog_task, hotcpu), |
124 | any_online_cpu(cpu_online_map)); | ||
127 | case CPU_DEAD: | 125 | case CPU_DEAD: |
128 | p = per_cpu(watchdog_task, hotcpu); | 126 | p = per_cpu(watchdog_task, hotcpu); |
129 | per_cpu(watchdog_task, hotcpu) = NULL; | 127 | per_cpu(watchdog_task, hotcpu) = NULL; |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84a9d18aa8da..b3d4dc858e35 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -119,13 +119,12 @@ static int stop_machine(void) | |||
119 | return ret; | 119 | return ret; |
120 | } | 120 | } |
121 | 121 | ||
122 | /* Don't schedule us away at this point, please. */ | ||
123 | local_irq_disable(); | ||
124 | |||
125 | /* Now they are all started, make them hold the CPUs, ready. */ | 122 | /* Now they are all started, make them hold the CPUs, ready. */ |
123 | preempt_disable(); | ||
126 | stopmachine_set_state(STOPMACHINE_PREPARE); | 124 | stopmachine_set_state(STOPMACHINE_PREPARE); |
127 | 125 | ||
128 | /* Make them disable irqs. */ | 126 | /* Make them disable irqs. */ |
127 | local_irq_disable(); | ||
129 | stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); | 128 | stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); |
130 | 129 | ||
131 | return 0; | 130 | return 0; |
@@ -135,6 +134,7 @@ static void restart_machine(void) | |||
135 | { | 134 | { |
136 | stopmachine_set_state(STOPMACHINE_EXIT); | 135 | stopmachine_set_state(STOPMACHINE_EXIT); |
137 | local_irq_enable(); | 136 | local_irq_enable(); |
137 | preempt_enable_no_resched(); | ||
138 | } | 138 | } |
139 | 139 | ||
140 | struct stop_machine_data | 140 | struct stop_machine_data |
diff --git a/kernel/sys.c b/kernel/sys.c index f723522e6986..bce933ebb29f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/suspend.h> | 28 | #include <linux/suspend.h> |
29 | #include <linux/tty.h> | 29 | #include <linux/tty.h> |
30 | #include <linux/signal.h> | 30 | #include <linux/signal.h> |
31 | #include <linux/cn_proc.h> | ||
31 | 32 | ||
32 | #include <linux/compat.h> | 33 | #include <linux/compat.h> |
33 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
@@ -361,17 +362,38 @@ out_unlock: | |||
361 | return retval; | 362 | return retval; |
362 | } | 363 | } |
363 | 364 | ||
365 | /** | ||
366 | * emergency_restart - reboot the system | ||
367 | * | ||
368 | * Without shutting down any hardware or taking any locks | ||
369 | * reboot the system. This is called when we know we are in | ||
370 | * trouble so this is our best effort to reboot. This is | ||
371 | * safe to call in interrupt context. | ||
372 | */ | ||
364 | void emergency_restart(void) | 373 | void emergency_restart(void) |
365 | { | 374 | { |
366 | machine_emergency_restart(); | 375 | machine_emergency_restart(); |
367 | } | 376 | } |
368 | EXPORT_SYMBOL_GPL(emergency_restart); | 377 | EXPORT_SYMBOL_GPL(emergency_restart); |
369 | 378 | ||
370 | void kernel_restart(char *cmd) | 379 | void kernel_restart_prepare(char *cmd) |
371 | { | 380 | { |
372 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); | 381 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); |
373 | system_state = SYSTEM_RESTART; | 382 | system_state = SYSTEM_RESTART; |
374 | device_shutdown(); | 383 | device_shutdown(); |
384 | } | ||
385 | |||
386 | /** | ||
387 | * kernel_restart - reboot the system | ||
388 | * @cmd: pointer to buffer containing command to execute for restart | ||
389 | * or %NULL | ||
390 | * | ||
391 | * Shutdown everything and perform a clean reboot. | ||
392 | * This is not safe to call in interrupt context. | ||
393 | */ | ||
394 | void kernel_restart(char *cmd) | ||
395 | { | ||
396 | kernel_restart_prepare(cmd); | ||
375 | if (!cmd) { | 397 | if (!cmd) { |
376 | printk(KERN_EMERG "Restarting system.\n"); | 398 | printk(KERN_EMERG "Restarting system.\n"); |
377 | } else { | 399 | } else { |
@@ -382,6 +404,12 @@ void kernel_restart(char *cmd) | |||
382 | } | 404 | } |
383 | EXPORT_SYMBOL_GPL(kernel_restart); | 405 | EXPORT_SYMBOL_GPL(kernel_restart); |
384 | 406 | ||
407 | /** | ||
408 | * kernel_kexec - reboot the system | ||
409 | * | ||
410 | * Move into place and start executing a preloaded standalone | ||
411 | * executable. If nothing was preloaded return an error. | ||
412 | */ | ||
385 | void kernel_kexec(void) | 413 | void kernel_kexec(void) |
386 | { | 414 | { |
387 | #ifdef CONFIG_KEXEC | 415 | #ifdef CONFIG_KEXEC |
@@ -390,9 +418,7 @@ void kernel_kexec(void) | |||
390 | if (!image) { | 418 | if (!image) { |
391 | return; | 419 | return; |
392 | } | 420 | } |
393 | notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); | 421 | kernel_restart_prepare(NULL); |
394 | system_state = SYSTEM_RESTART; | ||
395 | device_shutdown(); | ||
396 | printk(KERN_EMERG "Starting new kernel\n"); | 422 | printk(KERN_EMERG "Starting new kernel\n"); |
397 | machine_shutdown(); | 423 | machine_shutdown(); |
398 | machine_kexec(image); | 424 | machine_kexec(image); |
@@ -400,21 +426,39 @@ void kernel_kexec(void) | |||
400 | } | 426 | } |
401 | EXPORT_SYMBOL_GPL(kernel_kexec); | 427 | EXPORT_SYMBOL_GPL(kernel_kexec); |
402 | 428 | ||
403 | void kernel_halt(void) | 429 | /** |
430 | * kernel_halt - halt the system | ||
431 | * | ||
432 | * Shutdown everything and perform a clean system halt. | ||
433 | */ | ||
434 | void kernel_halt_prepare(void) | ||
404 | { | 435 | { |
405 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); | 436 | notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); |
406 | system_state = SYSTEM_HALT; | 437 | system_state = SYSTEM_HALT; |
407 | device_shutdown(); | 438 | device_shutdown(); |
439 | } | ||
440 | void kernel_halt(void) | ||
441 | { | ||
442 | kernel_halt_prepare(); | ||
408 | printk(KERN_EMERG "System halted.\n"); | 443 | printk(KERN_EMERG "System halted.\n"); |
409 | machine_halt(); | 444 | machine_halt(); |
410 | } | 445 | } |
411 | EXPORT_SYMBOL_GPL(kernel_halt); | 446 | EXPORT_SYMBOL_GPL(kernel_halt); |
412 | 447 | ||
413 | void kernel_power_off(void) | 448 | /** |
449 | * kernel_power_off - power_off the system | ||
450 | * | ||
451 | * Shutdown everything and perform a clean system power_off. | ||
452 | */ | ||
453 | void kernel_power_off_prepare(void) | ||
414 | { | 454 | { |
415 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); | 455 | notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); |
416 | system_state = SYSTEM_POWER_OFF; | 456 | system_state = SYSTEM_POWER_OFF; |
417 | device_shutdown(); | 457 | device_shutdown(); |
458 | } | ||
459 | void kernel_power_off(void) | ||
460 | { | ||
461 | kernel_power_off_prepare(); | ||
418 | printk(KERN_EMERG "Power down.\n"); | 462 | printk(KERN_EMERG "Power down.\n"); |
419 | machine_power_off(); | 463 | machine_power_off(); |
420 | } | 464 | } |
@@ -583,6 +627,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid) | |||
583 | current->egid = new_egid; | 627 | current->egid = new_egid; |
584 | current->gid = new_rgid; | 628 | current->gid = new_rgid; |
585 | key_fsgid_changed(current); | 629 | key_fsgid_changed(current); |
630 | proc_id_connector(current, PROC_EVENT_GID); | ||
586 | return 0; | 631 | return 0; |
587 | } | 632 | } |
588 | 633 | ||
@@ -622,6 +667,7 @@ asmlinkage long sys_setgid(gid_t gid) | |||
622 | return -EPERM; | 667 | return -EPERM; |
623 | 668 | ||
624 | key_fsgid_changed(current); | 669 | key_fsgid_changed(current); |
670 | proc_id_connector(current, PROC_EVENT_GID); | ||
625 | return 0; | 671 | return 0; |
626 | } | 672 | } |
627 | 673 | ||
@@ -711,6 +757,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | |||
711 | current->fsuid = current->euid; | 757 | current->fsuid = current->euid; |
712 | 758 | ||
713 | key_fsuid_changed(current); | 759 | key_fsuid_changed(current); |
760 | proc_id_connector(current, PROC_EVENT_UID); | ||
714 | 761 | ||
715 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); | 762 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); |
716 | } | 763 | } |
@@ -758,6 +805,7 @@ asmlinkage long sys_setuid(uid_t uid) | |||
758 | current->suid = new_suid; | 805 | current->suid = new_suid; |
759 | 806 | ||
760 | key_fsuid_changed(current); | 807 | key_fsuid_changed(current); |
808 | proc_id_connector(current, PROC_EVENT_UID); | ||
761 | 809 | ||
762 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); | 810 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); |
763 | } | 811 | } |
@@ -806,6 +854,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) | |||
806 | current->suid = suid; | 854 | current->suid = suid; |
807 | 855 | ||
808 | key_fsuid_changed(current); | 856 | key_fsuid_changed(current); |
857 | proc_id_connector(current, PROC_EVENT_UID); | ||
809 | 858 | ||
810 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); | 859 | return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); |
811 | } | 860 | } |
@@ -858,6 +907,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid) | |||
858 | current->sgid = sgid; | 907 | current->sgid = sgid; |
859 | 908 | ||
860 | key_fsgid_changed(current); | 909 | key_fsgid_changed(current); |
910 | proc_id_connector(current, PROC_EVENT_GID); | ||
861 | return 0; | 911 | return 0; |
862 | } | 912 | } |
863 | 913 | ||
@@ -900,6 +950,7 @@ asmlinkage long sys_setfsuid(uid_t uid) | |||
900 | } | 950 | } |
901 | 951 | ||
902 | key_fsuid_changed(current); | 952 | key_fsuid_changed(current); |
953 | proc_id_connector(current, PROC_EVENT_UID); | ||
903 | 954 | ||
904 | security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); | 955 | security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); |
905 | 956 | ||
@@ -928,6 +979,7 @@ asmlinkage long sys_setfsgid(gid_t gid) | |||
928 | } | 979 | } |
929 | current->fsgid = gid; | 980 | current->fsgid = gid; |
930 | key_fsgid_changed(current); | 981 | key_fsgid_changed(current); |
982 | proc_id_connector(current, PROC_EVENT_GID); | ||
931 | } | 983 | } |
932 | return old_fsgid; | 984 | return old_fsgid; |
933 | } | 985 | } |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e56e2495542..9990e10192e8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -169,7 +169,7 @@ struct file_operations proc_sys_file_operations = { | |||
169 | 169 | ||
170 | extern struct proc_dir_entry *proc_sys_root; | 170 | extern struct proc_dir_entry *proc_sys_root; |
171 | 171 | ||
172 | static void register_proc_table(ctl_table *, struct proc_dir_entry *); | 172 | static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *); |
173 | static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); | 173 | static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); |
174 | #endif | 174 | #endif |
175 | 175 | ||
@@ -952,7 +952,7 @@ static ctl_table fs_table[] = { | |||
952 | .data = &aio_nr, | 952 | .data = &aio_nr, |
953 | .maxlen = sizeof(aio_nr), | 953 | .maxlen = sizeof(aio_nr), |
954 | .mode = 0444, | 954 | .mode = 0444, |
955 | .proc_handler = &proc_dointvec, | 955 | .proc_handler = &proc_doulongvec_minmax, |
956 | }, | 956 | }, |
957 | { | 957 | { |
958 | .ctl_name = FS_AIO_MAX_NR, | 958 | .ctl_name = FS_AIO_MAX_NR, |
@@ -960,7 +960,7 @@ static ctl_table fs_table[] = { | |||
960 | .data = &aio_max_nr, | 960 | .data = &aio_max_nr, |
961 | .maxlen = sizeof(aio_max_nr), | 961 | .maxlen = sizeof(aio_max_nr), |
962 | .mode = 0644, | 962 | .mode = 0644, |
963 | .proc_handler = &proc_dointvec, | 963 | .proc_handler = &proc_doulongvec_minmax, |
964 | }, | 964 | }, |
965 | #ifdef CONFIG_INOTIFY | 965 | #ifdef CONFIG_INOTIFY |
966 | { | 966 | { |
@@ -992,10 +992,51 @@ static ctl_table dev_table[] = { | |||
992 | 992 | ||
993 | extern void init_irq_proc (void); | 993 | extern void init_irq_proc (void); |
994 | 994 | ||
995 | static DEFINE_SPINLOCK(sysctl_lock); | ||
996 | |||
997 | /* called under sysctl_lock */ | ||
998 | static int use_table(struct ctl_table_header *p) | ||
999 | { | ||
1000 | if (unlikely(p->unregistering)) | ||
1001 | return 0; | ||
1002 | p->used++; | ||
1003 | return 1; | ||
1004 | } | ||
1005 | |||
1006 | /* called under sysctl_lock */ | ||
1007 | static void unuse_table(struct ctl_table_header *p) | ||
1008 | { | ||
1009 | if (!--p->used) | ||
1010 | if (unlikely(p->unregistering)) | ||
1011 | complete(p->unregistering); | ||
1012 | } | ||
1013 | |||
1014 | /* called under sysctl_lock, will reacquire if has to wait */ | ||
1015 | static void start_unregistering(struct ctl_table_header *p) | ||
1016 | { | ||
1017 | /* | ||
1018 | * if p->used is 0, nobody will ever touch that entry again; | ||
1019 | * we'll eliminate all paths to it before dropping sysctl_lock | ||
1020 | */ | ||
1021 | if (unlikely(p->used)) { | ||
1022 | struct completion wait; | ||
1023 | init_completion(&wait); | ||
1024 | p->unregistering = &wait; | ||
1025 | spin_unlock(&sysctl_lock); | ||
1026 | wait_for_completion(&wait); | ||
1027 | spin_lock(&sysctl_lock); | ||
1028 | } | ||
1029 | /* | ||
1030 | * do not remove from the list until nobody holds it; walking the | ||
1031 | * list in do_sysctl() relies on that. | ||
1032 | */ | ||
1033 | list_del_init(&p->ctl_entry); | ||
1034 | } | ||
1035 | |||
995 | void __init sysctl_init(void) | 1036 | void __init sysctl_init(void) |
996 | { | 1037 | { |
997 | #ifdef CONFIG_PROC_FS | 1038 | #ifdef CONFIG_PROC_FS |
998 | register_proc_table(root_table, proc_sys_root); | 1039 | register_proc_table(root_table, proc_sys_root, &root_table_header); |
999 | init_irq_proc(); | 1040 | init_irq_proc(); |
1000 | #endif | 1041 | #endif |
1001 | } | 1042 | } |
@@ -1004,6 +1045,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1004 | void __user *newval, size_t newlen) | 1045 | void __user *newval, size_t newlen) |
1005 | { | 1046 | { |
1006 | struct list_head *tmp; | 1047 | struct list_head *tmp; |
1048 | int error = -ENOTDIR; | ||
1007 | 1049 | ||
1008 | if (nlen <= 0 || nlen >= CTL_MAXNAME) | 1050 | if (nlen <= 0 || nlen >= CTL_MAXNAME) |
1009 | return -ENOTDIR; | 1051 | return -ENOTDIR; |
@@ -1012,20 +1054,30 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1012 | if (!oldlenp || get_user(old_len, oldlenp)) | 1054 | if (!oldlenp || get_user(old_len, oldlenp)) |
1013 | return -EFAULT; | 1055 | return -EFAULT; |
1014 | } | 1056 | } |
1057 | spin_lock(&sysctl_lock); | ||
1015 | tmp = &root_table_header.ctl_entry; | 1058 | tmp = &root_table_header.ctl_entry; |
1016 | do { | 1059 | do { |
1017 | struct ctl_table_header *head = | 1060 | struct ctl_table_header *head = |
1018 | list_entry(tmp, struct ctl_table_header, ctl_entry); | 1061 | list_entry(tmp, struct ctl_table_header, ctl_entry); |
1019 | void *context = NULL; | 1062 | void *context = NULL; |
1020 | int error = parse_table(name, nlen, oldval, oldlenp, | 1063 | |
1064 | if (!use_table(head)) | ||
1065 | continue; | ||
1066 | |||
1067 | spin_unlock(&sysctl_lock); | ||
1068 | |||
1069 | error = parse_table(name, nlen, oldval, oldlenp, | ||
1021 | newval, newlen, head->ctl_table, | 1070 | newval, newlen, head->ctl_table, |
1022 | &context); | 1071 | &context); |
1023 | kfree(context); | 1072 | kfree(context); |
1073 | |||
1074 | spin_lock(&sysctl_lock); | ||
1075 | unuse_table(head); | ||
1024 | if (error != -ENOTDIR) | 1076 | if (error != -ENOTDIR) |
1025 | return error; | 1077 | break; |
1026 | tmp = tmp->next; | 1078 | } while ((tmp = tmp->next) != &root_table_header.ctl_entry); |
1027 | } while (tmp != &root_table_header.ctl_entry); | 1079 | spin_unlock(&sysctl_lock); |
1028 | return -ENOTDIR; | 1080 | return error; |
1029 | } | 1081 | } |
1030 | 1082 | ||
1031 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) | 1083 | asmlinkage long sys_sysctl(struct __sysctl_args __user *args) |
@@ -1236,12 +1288,16 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, | |||
1236 | return NULL; | 1288 | return NULL; |
1237 | tmp->ctl_table = table; | 1289 | tmp->ctl_table = table; |
1238 | INIT_LIST_HEAD(&tmp->ctl_entry); | 1290 | INIT_LIST_HEAD(&tmp->ctl_entry); |
1291 | tmp->used = 0; | ||
1292 | tmp->unregistering = NULL; | ||
1293 | spin_lock(&sysctl_lock); | ||
1239 | if (insert_at_head) | 1294 | if (insert_at_head) |
1240 | list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); | 1295 | list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); |
1241 | else | 1296 | else |
1242 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | 1297 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); |
1298 | spin_unlock(&sysctl_lock); | ||
1243 | #ifdef CONFIG_PROC_FS | 1299 | #ifdef CONFIG_PROC_FS |
1244 | register_proc_table(table, proc_sys_root); | 1300 | register_proc_table(table, proc_sys_root, tmp); |
1245 | #endif | 1301 | #endif |
1246 | return tmp; | 1302 | return tmp; |
1247 | } | 1303 | } |
@@ -1255,10 +1311,13 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table, | |||
1255 | */ | 1311 | */ |
1256 | void unregister_sysctl_table(struct ctl_table_header * header) | 1312 | void unregister_sysctl_table(struct ctl_table_header * header) |
1257 | { | 1313 | { |
1258 | list_del(&header->ctl_entry); | 1314 | might_sleep(); |
1315 | spin_lock(&sysctl_lock); | ||
1316 | start_unregistering(header); | ||
1259 | #ifdef CONFIG_PROC_FS | 1317 | #ifdef CONFIG_PROC_FS |
1260 | unregister_proc_table(header->ctl_table, proc_sys_root); | 1318 | unregister_proc_table(header->ctl_table, proc_sys_root); |
1261 | #endif | 1319 | #endif |
1320 | spin_unlock(&sysctl_lock); | ||
1262 | kfree(header); | 1321 | kfree(header); |
1263 | } | 1322 | } |
1264 | 1323 | ||
@@ -1269,7 +1328,7 @@ void unregister_sysctl_table(struct ctl_table_header * header) | |||
1269 | #ifdef CONFIG_PROC_FS | 1328 | #ifdef CONFIG_PROC_FS |
1270 | 1329 | ||
1271 | /* Scan the sysctl entries in table and add them all into /proc */ | 1330 | /* Scan the sysctl entries in table and add them all into /proc */ |
1272 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) | 1331 | static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) |
1273 | { | 1332 | { |
1274 | struct proc_dir_entry *de; | 1333 | struct proc_dir_entry *de; |
1275 | int len; | 1334 | int len; |
@@ -1305,13 +1364,14 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) | |||
1305 | de = create_proc_entry(table->procname, mode, root); | 1364 | de = create_proc_entry(table->procname, mode, root); |
1306 | if (!de) | 1365 | if (!de) |
1307 | continue; | 1366 | continue; |
1367 | de->set = set; | ||
1308 | de->data = (void *) table; | 1368 | de->data = (void *) table; |
1309 | if (table->proc_handler) | 1369 | if (table->proc_handler) |
1310 | de->proc_fops = &proc_sys_file_operations; | 1370 | de->proc_fops = &proc_sys_file_operations; |
1311 | } | 1371 | } |
1312 | table->de = de; | 1372 | table->de = de; |
1313 | if (de->mode & S_IFDIR) | 1373 | if (de->mode & S_IFDIR) |
1314 | register_proc_table(table->child, de); | 1374 | register_proc_table(table->child, de, set); |
1315 | } | 1375 | } |
1316 | } | 1376 | } |
1317 | 1377 | ||
@@ -1336,6 +1396,13 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root | |||
1336 | continue; | 1396 | continue; |
1337 | } | 1397 | } |
1338 | 1398 | ||
1399 | /* | ||
1400 | * In any case, mark the entry as goner; we'll keep it | ||
1401 | * around if it's busy, but we'll know to do nothing with | ||
1402 | * its fields. We are under sysctl_lock here. | ||
1403 | */ | ||
1404 | de->data = NULL; | ||
1405 | |||
1339 | /* Don't unregister proc entries that are still being used.. */ | 1406 | /* Don't unregister proc entries that are still being used.. */ |
1340 | if (atomic_read(&de->count)) | 1407 | if (atomic_read(&de->count)) |
1341 | continue; | 1408 | continue; |
@@ -1349,27 +1416,38 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | |||
1349 | size_t count, loff_t *ppos) | 1416 | size_t count, loff_t *ppos) |
1350 | { | 1417 | { |
1351 | int op; | 1418 | int op; |
1352 | struct proc_dir_entry *de; | 1419 | struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); |
1353 | struct ctl_table *table; | 1420 | struct ctl_table *table; |
1354 | size_t res; | 1421 | size_t res; |
1355 | ssize_t error; | 1422 | ssize_t error = -ENOTDIR; |
1356 | |||
1357 | de = PDE(file->f_dentry->d_inode); | ||
1358 | if (!de || !de->data) | ||
1359 | return -ENOTDIR; | ||
1360 | table = (struct ctl_table *) de->data; | ||
1361 | if (!table || !table->proc_handler) | ||
1362 | return -ENOTDIR; | ||
1363 | op = (write ? 002 : 004); | ||
1364 | if (ctl_perm(table, op)) | ||
1365 | return -EPERM; | ||
1366 | 1423 | ||
1367 | res = count; | 1424 | spin_lock(&sysctl_lock); |
1368 | 1425 | if (de && de->data && use_table(de->set)) { | |
1369 | error = (*table->proc_handler) (table, write, file, buf, &res, ppos); | 1426 | /* |
1370 | if (error) | 1427 | * at that point we know that sysctl was not unregistered |
1371 | return error; | 1428 | * and won't be until we finish |
1372 | return res; | 1429 | */ |
1430 | spin_unlock(&sysctl_lock); | ||
1431 | table = (struct ctl_table *) de->data; | ||
1432 | if (!table || !table->proc_handler) | ||
1433 | goto out; | ||
1434 | error = -EPERM; | ||
1435 | op = (write ? 002 : 004); | ||
1436 | if (ctl_perm(table, op)) | ||
1437 | goto out; | ||
1438 | |||
1439 | /* careful: calling conventions are nasty here */ | ||
1440 | res = count; | ||
1441 | error = (*table->proc_handler)(table, write, file, | ||
1442 | buf, &res, ppos); | ||
1443 | if (!error) | ||
1444 | error = res; | ||
1445 | out: | ||
1446 | spin_lock(&sysctl_lock); | ||
1447 | unuse_table(de->set); | ||
1448 | } | ||
1449 | spin_unlock(&sysctl_lock); | ||
1450 | return error; | ||
1373 | } | 1451 | } |
1374 | 1452 | ||
1375 | static int proc_opensys(struct inode *inode, struct file *file) | 1453 | static int proc_opensys(struct inode *inode, struct file *file) |
@@ -1997,6 +2075,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp, | |||
1997 | * @filp: the file structure | 2075 | * @filp: the file structure |
1998 | * @buffer: the user buffer | 2076 | * @buffer: the user buffer |
1999 | * @lenp: the size of the user buffer | 2077 | * @lenp: the size of the user buffer |
2078 | * @ppos: pointer to the file position | ||
2000 | * | 2079 | * |
2001 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer | 2080 | * Reads/writes up to table->maxlen/sizeof(unsigned int) integer |
2002 | * values from/to the user buffer, treated as an ASCII string. | 2081 | * values from/to the user buffer, treated as an ASCII string. |
diff --git a/kernel/time.c b/kernel/time.c index dd5ae1162a8f..245d595a13cb 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc) | |||
338 | if (mtemp >= MINSEC) { | 338 | if (mtemp >= MINSEC) { |
339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - | 339 | ltemp = (time_offset / mtemp) << (SHIFT_USEC - |
340 | SHIFT_UPDATE); | 340 | SHIFT_UPDATE); |
341 | if (ltemp < 0) | 341 | time_freq += shift_right(ltemp, SHIFT_KH); |
342 | time_freq -= -ltemp >> SHIFT_KH; | ||
343 | else | ||
344 | time_freq += ltemp >> SHIFT_KH; | ||
345 | } else /* calibration interval too short (p. 12) */ | 342 | } else /* calibration interval too short (p. 12) */ |
346 | result = TIME_ERROR; | 343 | result = TIME_ERROR; |
347 | } else { /* PLL mode */ | 344 | } else { /* PLL mode */ |
348 | if (mtemp < MAXSEC) { | 345 | if (mtemp < MAXSEC) { |
349 | ltemp *= mtemp; | 346 | ltemp *= mtemp; |
350 | if (ltemp < 0) | 347 | time_freq += shift_right(ltemp,(time_constant + |
351 | time_freq -= -ltemp >> (time_constant + | ||
352 | time_constant + | ||
353 | SHIFT_KF - SHIFT_USEC); | ||
354 | else | ||
355 | time_freq += ltemp >> (time_constant + | ||
356 | time_constant + | 348 | time_constant + |
357 | SHIFT_KF - SHIFT_USEC); | 349 | SHIFT_KF - SHIFT_USEC)); |
358 | } else /* calibration interval too long (p. 12) */ | 350 | } else /* calibration interval too long (p. 12) */ |
359 | result = TIME_ERROR; | 351 | result = TIME_ERROR; |
360 | } | 352 | } |
361 | if (time_freq > time_tolerance) | 353 | time_freq = min(time_freq, time_tolerance); |
362 | time_freq = time_tolerance; | 354 | time_freq = max(time_freq, -time_tolerance); |
363 | else if (time_freq < -time_tolerance) | ||
364 | time_freq = -time_tolerance; | ||
365 | } /* STA_PLL || STA_PPSTIME */ | 355 | } /* STA_PLL || STA_PPSTIME */ |
366 | } /* txc->modes & ADJ_OFFSET */ | 356 | } /* txc->modes & ADJ_OFFSET */ |
367 | if (txc->modes & ADJ_TICK) { | 357 | if (txc->modes & ADJ_TICK) { |
@@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0 | |||
384 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 374 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) |
385 | txc->offset = save_adjust; | 375 | txc->offset = save_adjust; |
386 | else { | 376 | else { |
387 | if (time_offset < 0) | 377 | txc->offset = shift_right(time_offset, SHIFT_UPDATE); |
388 | txc->offset = -(-time_offset >> SHIFT_UPDATE); | ||
389 | else | ||
390 | txc->offset = time_offset >> SHIFT_UPDATE; | ||
391 | } | 378 | } |
392 | txc->freq = time_freq + pps_freq; | 379 | txc->freq = time_freq + pps_freq; |
393 | txc->maxerror = time_maxerror; | 380 | txc->maxerror = time_maxerror; |
@@ -532,6 +519,7 @@ int do_settimeofday (struct timespec *tv) | |||
532 | clock_was_set(); | 519 | clock_was_set(); |
533 | return 0; | 520 | return 0; |
534 | } | 521 | } |
522 | EXPORT_SYMBOL(do_settimeofday); | ||
535 | 523 | ||
536 | void do_gettimeofday (struct timeval *tv) | 524 | void do_gettimeofday (struct timeval *tv) |
537 | { | 525 | { |
@@ -570,6 +558,7 @@ void getnstimeofday(struct timespec *tv) | |||
570 | tv->tv_sec = x.tv_sec; | 558 | tv->tv_sec = x.tv_sec; |
571 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; | 559 | tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; |
572 | } | 560 | } |
561 | EXPORT_SYMBOL_GPL(getnstimeofday); | ||
573 | #endif | 562 | #endif |
574 | 563 | ||
575 | #if (BITS_PER_LONG < 64) | 564 | #if (BITS_PER_LONG < 64) |
diff --git a/kernel/timer.c b/kernel/timer.c index 3ba10fa35b60..fd74268d8663 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec); | |||
46 | #define time_interpolator_update(x) | 46 | #define time_interpolator_update(x) |
47 | #endif | 47 | #endif |
48 | 48 | ||
49 | u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; | ||
50 | |||
51 | EXPORT_SYMBOL(jiffies_64); | ||
52 | |||
49 | /* | 53 | /* |
50 | * per-CPU timer vector definitions: | 54 | * per-CPU timer vector definitions: |
51 | */ | 55 | */ |
@@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base, | |||
91 | #endif | 95 | #endif |
92 | } | 96 | } |
93 | 97 | ||
94 | static void check_timer_failed(struct timer_list *timer) | ||
95 | { | ||
96 | static int whine_count; | ||
97 | if (whine_count < 16) { | ||
98 | whine_count++; | ||
99 | printk("Uninitialised timer!\n"); | ||
100 | printk("This is just a warning. Your computer is OK\n"); | ||
101 | printk("function=0x%p, data=0x%lx\n", | ||
102 | timer->function, timer->data); | ||
103 | dump_stack(); | ||
104 | } | ||
105 | /* | ||
106 | * Now fix it up | ||
107 | */ | ||
108 | timer->magic = TIMER_MAGIC; | ||
109 | } | ||
110 | |||
111 | static inline void check_timer(struct timer_list *timer) | ||
112 | { | ||
113 | if (timer->magic != TIMER_MAGIC) | ||
114 | check_timer_failed(timer); | ||
115 | } | ||
116 | |||
117 | |||
118 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 98 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) |
119 | { | 99 | { |
120 | unsigned long expires = timer->expires; | 100 | unsigned long expires = timer->expires; |
@@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer) | |||
177 | { | 157 | { |
178 | timer->entry.next = NULL; | 158 | timer->entry.next = NULL; |
179 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; | 159 | timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; |
180 | timer->magic = TIMER_MAGIC; | ||
181 | } | 160 | } |
182 | EXPORT_SYMBOL(init_timer); | 161 | EXPORT_SYMBOL(init_timer); |
183 | 162 | ||
@@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires) | |||
230 | int ret = 0; | 209 | int ret = 0; |
231 | 210 | ||
232 | BUG_ON(!timer->function); | 211 | BUG_ON(!timer->function); |
233 | check_timer(timer); | ||
234 | 212 | ||
235 | base = lock_timer_base(timer, &flags); | 213 | base = lock_timer_base(timer, &flags); |
236 | 214 | ||
@@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu) | |||
283 | unsigned long flags; | 261 | unsigned long flags; |
284 | 262 | ||
285 | BUG_ON(timer_pending(timer) || !timer->function); | 263 | BUG_ON(timer_pending(timer) || !timer->function); |
286 | |||
287 | check_timer(timer); | ||
288 | |||
289 | spin_lock_irqsave(&base->t_base.lock, flags); | 264 | spin_lock_irqsave(&base->t_base.lock, flags); |
290 | timer->base = &base->t_base; | 265 | timer->base = &base->t_base; |
291 | internal_add_timer(base, timer); | 266 | internal_add_timer(base, timer); |
@@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires) | |||
316 | { | 291 | { |
317 | BUG_ON(!timer->function); | 292 | BUG_ON(!timer->function); |
318 | 293 | ||
319 | check_timer(timer); | ||
320 | |||
321 | /* | 294 | /* |
322 | * This is a common optimization triggered by the | 295 | * This is a common optimization triggered by the |
323 | * networking code - if the timer is re-modified | 296 | * networking code - if the timer is re-modified |
@@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer) | |||
348 | unsigned long flags; | 321 | unsigned long flags; |
349 | int ret = 0; | 322 | int ret = 0; |
350 | 323 | ||
351 | check_timer(timer); | ||
352 | |||
353 | if (timer_pending(timer)) { | 324 | if (timer_pending(timer)) { |
354 | base = lock_timer_base(timer, &flags); | 325 | base = lock_timer_base(timer, &flags); |
355 | if (timer_pending(timer)) { | 326 | if (timer_pending(timer)) { |
@@ -412,8 +383,6 @@ out: | |||
412 | */ | 383 | */ |
413 | int del_timer_sync(struct timer_list *timer) | 384 | int del_timer_sync(struct timer_list *timer) |
414 | { | 385 | { |
415 | check_timer(timer); | ||
416 | |||
417 | for (;;) { | 386 | for (;;) { |
418 | int ret = try_to_del_timer_sync(timer); | 387 | int ret = try_to_del_timer_sync(timer); |
419 | if (ret >= 0) | 388 | if (ret >= 0) |
@@ -632,134 +601,118 @@ long time_next_adjust; | |||
632 | */ | 601 | */ |
633 | static void second_overflow(void) | 602 | static void second_overflow(void) |
634 | { | 603 | { |
635 | long ltemp; | 604 | long ltemp; |
636 | 605 | ||
637 | /* Bump the maxerror field */ | 606 | /* Bump the maxerror field */ |
638 | time_maxerror += time_tolerance >> SHIFT_USEC; | 607 | time_maxerror += time_tolerance >> SHIFT_USEC; |
639 | if ( time_maxerror > NTP_PHASE_LIMIT ) { | 608 | if (time_maxerror > NTP_PHASE_LIMIT) { |
640 | time_maxerror = NTP_PHASE_LIMIT; | 609 | time_maxerror = NTP_PHASE_LIMIT; |
641 | time_status |= STA_UNSYNC; | 610 | time_status |= STA_UNSYNC; |
642 | } | ||
643 | |||
644 | /* | ||
645 | * Leap second processing. If in leap-insert state at | ||
646 | * the end of the day, the system clock is set back one | ||
647 | * second; if in leap-delete state, the system clock is | ||
648 | * set ahead one second. The microtime() routine or | ||
649 | * external clock driver will insure that reported time | ||
650 | * is always monotonic. The ugly divides should be | ||
651 | * replaced. | ||
652 | */ | ||
653 | switch (time_state) { | ||
654 | |||
655 | case TIME_OK: | ||
656 | if (time_status & STA_INS) | ||
657 | time_state = TIME_INS; | ||
658 | else if (time_status & STA_DEL) | ||
659 | time_state = TIME_DEL; | ||
660 | break; | ||
661 | |||
662 | case TIME_INS: | ||
663 | if (xtime.tv_sec % 86400 == 0) { | ||
664 | xtime.tv_sec--; | ||
665 | wall_to_monotonic.tv_sec++; | ||
666 | /* The timer interpolator will make time change gradually instead | ||
667 | * of an immediate jump by one second. | ||
668 | */ | ||
669 | time_interpolator_update(-NSEC_PER_SEC); | ||
670 | time_state = TIME_OOP; | ||
671 | clock_was_set(); | ||
672 | printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); | ||
673 | } | 611 | } |
674 | break; | 612 | |
675 | 613 | /* | |
676 | case TIME_DEL: | 614 | * Leap second processing. If in leap-insert state at the end of the |
677 | if ((xtime.tv_sec + 1) % 86400 == 0) { | 615 | * day, the system clock is set back one second; if in leap-delete |
678 | xtime.tv_sec++; | 616 | * state, the system clock is set ahead one second. The microtime() |
679 | wall_to_monotonic.tv_sec--; | 617 | * routine or external clock driver will insure that reported time is |
680 | /* Use of time interpolator for a gradual change of time */ | 618 | * always monotonic. The ugly divides should be replaced. |
681 | time_interpolator_update(NSEC_PER_SEC); | 619 | */ |
682 | time_state = TIME_WAIT; | 620 | switch (time_state) { |
683 | clock_was_set(); | 621 | case TIME_OK: |
684 | printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); | 622 | if (time_status & STA_INS) |
623 | time_state = TIME_INS; | ||
624 | else if (time_status & STA_DEL) | ||
625 | time_state = TIME_DEL; | ||
626 | break; | ||
627 | case TIME_INS: | ||
628 | if (xtime.tv_sec % 86400 == 0) { | ||
629 | xtime.tv_sec--; | ||
630 | wall_to_monotonic.tv_sec++; | ||
631 | /* | ||
632 | * The timer interpolator will make time change | ||
633 | * gradually instead of an immediate jump by one second | ||
634 | */ | ||
635 | time_interpolator_update(-NSEC_PER_SEC); | ||
636 | time_state = TIME_OOP; | ||
637 | clock_was_set(); | ||
638 | printk(KERN_NOTICE "Clock: inserting leap second " | ||
639 | "23:59:60 UTC\n"); | ||
640 | } | ||
641 | break; | ||
642 | case TIME_DEL: | ||
643 | if ((xtime.tv_sec + 1) % 86400 == 0) { | ||
644 | xtime.tv_sec++; | ||
645 | wall_to_monotonic.tv_sec--; | ||
646 | /* | ||
647 | * Use of time interpolator for a gradual change of | ||
648 | * time | ||
649 | */ | ||
650 | time_interpolator_update(NSEC_PER_SEC); | ||
651 | time_state = TIME_WAIT; | ||
652 | clock_was_set(); | ||
653 | printk(KERN_NOTICE "Clock: deleting leap second " | ||
654 | "23:59:59 UTC\n"); | ||
655 | } | ||
656 | break; | ||
657 | case TIME_OOP: | ||
658 | time_state = TIME_WAIT; | ||
659 | break; | ||
660 | case TIME_WAIT: | ||
661 | if (!(time_status & (STA_INS | STA_DEL))) | ||
662 | time_state = TIME_OK; | ||
685 | } | 663 | } |
686 | break; | 664 | |
687 | 665 | /* | |
688 | case TIME_OOP: | 666 | * Compute the phase adjustment for the next second. In PLL mode, the |
689 | time_state = TIME_WAIT; | 667 | * offset is reduced by a fixed factor times the time constant. In FLL |
690 | break; | 668 | * mode the offset is used directly. In either mode, the maximum phase |
691 | 669 | * adjustment for each second is clamped so as to spread the adjustment | |
692 | case TIME_WAIT: | 670 | * over not more than the number of seconds between updates. |
693 | if (!(time_status & (STA_INS | STA_DEL))) | 671 | */ |
694 | time_state = TIME_OK; | ||
695 | } | ||
696 | |||
697 | /* | ||
698 | * Compute the phase adjustment for the next second. In | ||
699 | * PLL mode, the offset is reduced by a fixed factor | ||
700 | * times the time constant. In FLL mode the offset is | ||
701 | * used directly. In either mode, the maximum phase | ||
702 | * adjustment for each second is clamped so as to spread | ||
703 | * the adjustment over not more than the number of | ||
704 | * seconds between updates. | ||
705 | */ | ||
706 | if (time_offset < 0) { | ||
707 | ltemp = -time_offset; | ||
708 | if (!(time_status & STA_FLL)) | ||
709 | ltemp >>= SHIFT_KG + time_constant; | ||
710 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | ||
711 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | ||
712 | time_offset += ltemp; | ||
713 | time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | ||
714 | } else { | ||
715 | ltemp = time_offset; | 672 | ltemp = time_offset; |
716 | if (!(time_status & STA_FLL)) | 673 | if (!(time_status & STA_FLL)) |
717 | ltemp >>= SHIFT_KG + time_constant; | 674 | ltemp = shift_right(ltemp, SHIFT_KG + time_constant); |
718 | if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) | 675 | ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE); |
719 | ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; | 676 | ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE); |
720 | time_offset -= ltemp; | 677 | time_offset -= ltemp; |
721 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); | 678 | time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); |
722 | } | 679 | |
723 | 680 | /* | |
724 | /* | 681 | * Compute the frequency estimate and additional phase adjustment due |
725 | * Compute the frequency estimate and additional phase | 682 | * to frequency error for the next second. When the PPS signal is |
726 | * adjustment due to frequency error for the next | 683 | * engaged, gnaw on the watchdog counter and update the frequency |
727 | * second. When the PPS signal is engaged, gnaw on the | 684 | * computed by the pll and the PPS signal. |
728 | * watchdog counter and update the frequency computed by | 685 | */ |
729 | * the pll and the PPS signal. | 686 | pps_valid++; |
730 | */ | 687 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ |
731 | pps_valid++; | 688 | pps_jitter = MAXTIME; |
732 | if (pps_valid == PPS_VALID) { /* PPS signal lost */ | 689 | pps_stabil = MAXFREQ; |
733 | pps_jitter = MAXTIME; | 690 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | |
734 | pps_stabil = MAXFREQ; | 691 | STA_PPSWANDER | STA_PPSERROR); |
735 | time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | | 692 | } |
736 | STA_PPSWANDER | STA_PPSERROR); | 693 | ltemp = time_freq + pps_freq; |
737 | } | 694 | time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE)); |
738 | ltemp = time_freq + pps_freq; | ||
739 | if (ltemp < 0) | ||
740 | time_adj -= -ltemp >> | ||
741 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
742 | else | ||
743 | time_adj += ltemp >> | ||
744 | (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); | ||
745 | 695 | ||
746 | #if HZ == 100 | 696 | #if HZ == 100 |
747 | /* Compensate for (HZ==100) != (1 << SHIFT_HZ). | 697 | /* |
748 | * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) | 698 | * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to |
749 | */ | 699 | * get 128.125; => only 0.125% error (p. 14) |
750 | if (time_adj < 0) | 700 | */ |
751 | time_adj -= (-time_adj >> 2) + (-time_adj >> 5); | 701 | time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5); |
752 | else | 702 | #endif |
753 | time_adj += (time_adj >> 2) + (time_adj >> 5); | 703 | #if HZ == 250 |
704 | /* | ||
705 | * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and | ||
706 | * 0.78125% to get 255.85938; => only 0.05% error (p. 14) | ||
707 | */ | ||
708 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); | ||
754 | #endif | 709 | #endif |
755 | #if HZ == 1000 | 710 | #if HZ == 1000 |
756 | /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). | 711 | /* |
757 | * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) | 712 | * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and |
758 | */ | 713 | * 0.78125% to get 1023.4375; => only 0.05% error (p. 14) |
759 | if (time_adj < 0) | 714 | */ |
760 | time_adj -= (-time_adj >> 6) + (-time_adj >> 7); | 715 | time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7); |
761 | else | ||
762 | time_adj += (time_adj >> 6) + (time_adj >> 7); | ||
763 | #endif | 716 | #endif |
764 | } | 717 | } |
765 | 718 | ||
@@ -768,23 +721,20 @@ static void update_wall_time_one_tick(void) | |||
768 | { | 721 | { |
769 | long time_adjust_step, delta_nsec; | 722 | long time_adjust_step, delta_nsec; |
770 | 723 | ||
771 | if ( (time_adjust_step = time_adjust) != 0 ) { | 724 | if ((time_adjust_step = time_adjust) != 0 ) { |
772 | /* We are doing an adjtime thing. | 725 | /* |
773 | * | 726 | * We are doing an adjtime thing. Prepare time_adjust_step to |
774 | * Prepare time_adjust_step to be within bounds. | 727 | * be within bounds. Note that a positive time_adjust means we |
775 | * Note that a positive time_adjust means we want the clock | 728 | * want the clock to run faster. |
776 | * to run faster. | 729 | * |
777 | * | 730 | * Limit the amount of the step to be in the range |
778 | * Limit the amount of the step to be in the range | 731 | * -tickadj .. +tickadj |
779 | * -tickadj .. +tickadj | 732 | */ |
780 | */ | 733 | time_adjust_step = min(time_adjust_step, (long)tickadj); |
781 | if (time_adjust > tickadj) | 734 | time_adjust_step = max(time_adjust_step, (long)-tickadj); |
782 | time_adjust_step = tickadj; | 735 | |
783 | else if (time_adjust < -tickadj) | 736 | /* Reduce by this step the amount of time left */ |
784 | time_adjust_step = -tickadj; | 737 | time_adjust -= time_adjust_step; |
785 | |||
786 | /* Reduce by this step the amount of time left */ | ||
787 | time_adjust -= time_adjust_step; | ||
788 | } | 738 | } |
789 | delta_nsec = tick_nsec + time_adjust_step * 1000; | 739 | delta_nsec = tick_nsec + time_adjust_step * 1000; |
790 | /* | 740 | /* |
@@ -792,13 +742,8 @@ static void update_wall_time_one_tick(void) | |||
792 | * advance the tick more. | 742 | * advance the tick more. |
793 | */ | 743 | */ |
794 | time_phase += time_adj; | 744 | time_phase += time_adj; |
795 | if (time_phase <= -FINENSEC) { | 745 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { |
796 | long ltemp = -time_phase >> (SHIFT_SCALE - 10); | 746 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); |
797 | time_phase += ltemp << (SHIFT_SCALE - 10); | ||
798 | delta_nsec -= ltemp; | ||
799 | } | ||
800 | else if (time_phase >= FINENSEC) { | ||
801 | long ltemp = time_phase >> (SHIFT_SCALE - 10); | ||
802 | time_phase -= ltemp << (SHIFT_SCALE - 10); | 747 | time_phase -= ltemp << (SHIFT_SCALE - 10); |
803 | delta_nsec += ltemp; | 748 | delta_nsec += ltemp; |
804 | } | 749 | } |
@@ -1128,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
1128 | if (timeout < 0) | 1073 | if (timeout < 0) |
1129 | { | 1074 | { |
1130 | printk(KERN_ERR "schedule_timeout: wrong timeout " | 1075 | printk(KERN_ERR "schedule_timeout: wrong timeout " |
1131 | "value %lx from %p\n", timeout, | 1076 | "value %lx from %p\n", timeout, |
1132 | __builtin_return_address(0)); | 1077 | __builtin_return_address(0)); |
1133 | current->state = TASK_RUNNING; | 1078 | current->state = TASK_RUNNING; |
1134 | goto out; | 1079 | goto out; |
1135 | } | 1080 | } |
@@ -1137,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
1137 | 1082 | ||
1138 | expire = timeout + jiffies; | 1083 | expire = timeout + jiffies; |
1139 | 1084 | ||
1140 | init_timer(&timer); | 1085 | setup_timer(&timer, process_timeout, (unsigned long)current); |
1141 | timer.expires = expire; | 1086 | __mod_timer(&timer, expire); |
1142 | timer.data = (unsigned long) current; | ||
1143 | timer.function = process_timeout; | ||
1144 | |||
1145 | add_timer(&timer); | ||
1146 | schedule(); | 1087 | schedule(); |
1147 | del_singleshot_timer_sync(&timer); | 1088 | del_singleshot_timer_sync(&timer); |
1148 | 1089 | ||
@@ -1159,15 +1100,15 @@ EXPORT_SYMBOL(schedule_timeout); | |||
1159 | */ | 1100 | */ |
1160 | signed long __sched schedule_timeout_interruptible(signed long timeout) | 1101 | signed long __sched schedule_timeout_interruptible(signed long timeout) |
1161 | { | 1102 | { |
1162 | __set_current_state(TASK_INTERRUPTIBLE); | 1103 | __set_current_state(TASK_INTERRUPTIBLE); |
1163 | return schedule_timeout(timeout); | 1104 | return schedule_timeout(timeout); |
1164 | } | 1105 | } |
1165 | EXPORT_SYMBOL(schedule_timeout_interruptible); | 1106 | EXPORT_SYMBOL(schedule_timeout_interruptible); |
1166 | 1107 | ||
1167 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | 1108 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) |
1168 | { | 1109 | { |
1169 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1110 | __set_current_state(TASK_UNINTERRUPTIBLE); |
1170 | return schedule_timeout(timeout); | 1111 | return schedule_timeout(timeout); |
1171 | } | 1112 | } |
1172 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | 1113 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); |
1173 | 1114 | ||
@@ -1507,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec) | |||
1507 | if (!time_interpolator) | 1448 | if (!time_interpolator) |
1508 | return; | 1449 | return; |
1509 | 1450 | ||
1510 | /* The interpolator compensates for late ticks by accumulating | 1451 | /* |
1511 | * the late time in time_interpolator->offset. A tick earlier than | 1452 | * The interpolator compensates for late ticks by accumulating the late |
1512 | * expected will lead to a reset of the offset and a corresponding | 1453 | * time in time_interpolator->offset. A tick earlier than expected will |
1513 | * jump of the clock forward. Again this only works if the | 1454 | * lead to a reset of the offset and a corresponding jump of the clock |
1514 | * interpolator clock is running slightly slower than the regular clock | 1455 | * forward. Again this only works if the interpolator clock is running |
1515 | * and the tuning logic insures that. | 1456 | * slightly slower than the regular clock and the tuning logic insures |
1516 | */ | 1457 | * that. |
1458 | */ | ||
1517 | 1459 | ||
1518 | counter = time_interpolator_get_counter(1); | 1460 | counter = time_interpolator_get_counter(1); |
1519 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); | 1461 | offset = time_interpolator->offset + |
1462 | GET_TI_NSECS(counter, time_interpolator); | ||
1520 | 1463 | ||
1521 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | 1464 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) |
1522 | time_interpolator->offset = offset - delta_nsec; | 1465 | time_interpolator->offset = offset - delta_nsec; |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 91bacb13a7e2..2bd5aee1c736 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -12,6 +12,8 @@ | |||
12 | * Andrew Morton <andrewm@uow.edu.au> | 12 | * Andrew Morton <andrewm@uow.edu.au> |
13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> | 13 | * Kai Petzke <wpp@marie.physik.tu-berlin.de> |
14 | * Theodore Ts'o <tytso@mit.edu> | 14 | * Theodore Ts'o <tytso@mit.edu> |
15 | * | ||
16 | * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>. | ||
15 | */ | 17 | */ |
16 | 18 | ||
17 | #include <linux/module.h> | 19 | #include <linux/module.h> |
@@ -57,7 +59,7 @@ struct cpu_workqueue_struct { | |||
57 | * per-CPU workqueues: | 59 | * per-CPU workqueues: |
58 | */ | 60 | */ |
59 | struct workqueue_struct { | 61 | struct workqueue_struct { |
60 | struct cpu_workqueue_struct cpu_wq[NR_CPUS]; | 62 | struct cpu_workqueue_struct *cpu_wq; |
61 | const char *name; | 63 | const char *name; |
62 | struct list_head list; /* Empty if single thread */ | 64 | struct list_head list; /* Empty if single thread */ |
63 | }; | 65 | }; |
@@ -100,9 +102,9 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
100 | 102 | ||
101 | if (!test_and_set_bit(0, &work->pending)) { | 103 | if (!test_and_set_bit(0, &work->pending)) { |
102 | if (unlikely(is_single_threaded(wq))) | 104 | if (unlikely(is_single_threaded(wq))) |
103 | cpu = 0; | 105 | cpu = any_online_cpu(cpu_online_map); |
104 | BUG_ON(!list_empty(&work->entry)); | 106 | BUG_ON(!list_empty(&work->entry)); |
105 | __queue_work(wq->cpu_wq + cpu, work); | 107 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
106 | ret = 1; | 108 | ret = 1; |
107 | } | 109 | } |
108 | put_cpu(); | 110 | put_cpu(); |
@@ -116,9 +118,9 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
116 | int cpu = smp_processor_id(); | 118 | int cpu = smp_processor_id(); |
117 | 119 | ||
118 | if (unlikely(is_single_threaded(wq))) | 120 | if (unlikely(is_single_threaded(wq))) |
119 | cpu = 0; | 121 | cpu = any_online_cpu(cpu_online_map); |
120 | 122 | ||
121 | __queue_work(wq->cpu_wq + cpu, work); | 123 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
122 | } | 124 | } |
123 | 125 | ||
124 | int fastcall queue_delayed_work(struct workqueue_struct *wq, | 126 | int fastcall queue_delayed_work(struct workqueue_struct *wq, |
@@ -264,14 +266,14 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
264 | might_sleep(); | 266 | might_sleep(); |
265 | 267 | ||
266 | if (is_single_threaded(wq)) { | 268 | if (is_single_threaded(wq)) { |
267 | /* Always use cpu 0's area. */ | 269 | /* Always use first cpu's area. */ |
268 | flush_cpu_workqueue(wq->cpu_wq + 0); | 270 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); |
269 | } else { | 271 | } else { |
270 | int cpu; | 272 | int cpu; |
271 | 273 | ||
272 | lock_cpu_hotplug(); | 274 | lock_cpu_hotplug(); |
273 | for_each_online_cpu(cpu) | 275 | for_each_online_cpu(cpu) |
274 | flush_cpu_workqueue(wq->cpu_wq + cpu); | 276 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); |
275 | unlock_cpu_hotplug(); | 277 | unlock_cpu_hotplug(); |
276 | } | 278 | } |
277 | } | 279 | } |
@@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
279 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | 281 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, |
280 | int cpu) | 282 | int cpu) |
281 | { | 283 | { |
282 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | 284 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
283 | struct task_struct *p; | 285 | struct task_struct *p; |
284 | 286 | ||
285 | spin_lock_init(&cwq->lock); | 287 | spin_lock_init(&cwq->lock); |
@@ -312,12 +314,13 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
312 | if (!wq) | 314 | if (!wq) |
313 | return NULL; | 315 | return NULL; |
314 | 316 | ||
317 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | ||
315 | wq->name = name; | 318 | wq->name = name; |
316 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 319 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
317 | lock_cpu_hotplug(); | 320 | lock_cpu_hotplug(); |
318 | if (singlethread) { | 321 | if (singlethread) { |
319 | INIT_LIST_HEAD(&wq->list); | 322 | INIT_LIST_HEAD(&wq->list); |
320 | p = create_workqueue_thread(wq, 0); | 323 | p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); |
321 | if (!p) | 324 | if (!p) |
322 | destroy = 1; | 325 | destroy = 1; |
323 | else | 326 | else |
@@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) | |||
353 | unsigned long flags; | 356 | unsigned long flags; |
354 | struct task_struct *p; | 357 | struct task_struct *p; |
355 | 358 | ||
356 | cwq = wq->cpu_wq + cpu; | 359 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
357 | spin_lock_irqsave(&cwq->lock, flags); | 360 | spin_lock_irqsave(&cwq->lock, flags); |
358 | p = cwq->thread; | 361 | p = cwq->thread; |
359 | cwq->thread = NULL; | 362 | cwq->thread = NULL; |
@@ -371,7 +374,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
371 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 374 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
372 | lock_cpu_hotplug(); | 375 | lock_cpu_hotplug(); |
373 | if (is_single_threaded(wq)) | 376 | if (is_single_threaded(wq)) |
374 | cleanup_workqueue_thread(wq, 0); | 377 | cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); |
375 | else { | 378 | else { |
376 | for_each_online_cpu(cpu) | 379 | for_each_online_cpu(cpu) |
377 | cleanup_workqueue_thread(wq, cpu); | 380 | cleanup_workqueue_thread(wq, cpu); |
@@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
380 | spin_unlock(&workqueue_lock); | 383 | spin_unlock(&workqueue_lock); |
381 | } | 384 | } |
382 | unlock_cpu_hotplug(); | 385 | unlock_cpu_hotplug(); |
386 | free_percpu(wq->cpu_wq); | ||
383 | kfree(wq); | 387 | kfree(wq); |
384 | } | 388 | } |
385 | 389 | ||
@@ -458,7 +462,7 @@ int current_is_keventd(void) | |||
458 | 462 | ||
459 | BUG_ON(!keventd_wq); | 463 | BUG_ON(!keventd_wq); |
460 | 464 | ||
461 | cwq = keventd_wq->cpu_wq + cpu; | 465 | cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu); |
462 | if (current == cwq->thread) | 466 | if (current == cwq->thread) |
463 | ret = 1; | 467 | ret = 1; |
464 | 468 | ||
@@ -470,7 +474,7 @@ int current_is_keventd(void) | |||
470 | /* Take the work from this (downed) CPU. */ | 474 | /* Take the work from this (downed) CPU. */ |
471 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | 475 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) |
472 | { | 476 | { |
473 | struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; | 477 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
474 | LIST_HEAD(list); | 478 | LIST_HEAD(list); |
475 | struct work_struct *work; | 479 | struct work_struct *work; |
476 | 480 | ||
@@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
481 | printk("Taking work for %s\n", wq->name); | 485 | printk("Taking work for %s\n", wq->name); |
482 | work = list_entry(list.next,struct work_struct,entry); | 486 | work = list_entry(list.next,struct work_struct,entry); |
483 | list_del(&work->entry); | 487 | list_del(&work->entry); |
484 | __queue_work(wq->cpu_wq + smp_processor_id(), work); | 488 | __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work); |
485 | } | 489 | } |
486 | spin_unlock_irq(&cwq->lock); | 490 | spin_unlock_irq(&cwq->lock); |
487 | } | 491 | } |
@@ -508,16 +512,19 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
508 | case CPU_ONLINE: | 512 | case CPU_ONLINE: |
509 | /* Kick off worker threads. */ | 513 | /* Kick off worker threads. */ |
510 | list_for_each_entry(wq, &workqueues, list) { | 514 | list_for_each_entry(wq, &workqueues, list) { |
511 | kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); | 515 | struct cpu_workqueue_struct *cwq; |
512 | wake_up_process(wq->cpu_wq[hotcpu].thread); | 516 | |
517 | cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); | ||
518 | kthread_bind(cwq->thread, hotcpu); | ||
519 | wake_up_process(cwq->thread); | ||
513 | } | 520 | } |
514 | break; | 521 | break; |
515 | 522 | ||
516 | case CPU_UP_CANCELED: | 523 | case CPU_UP_CANCELED: |
517 | list_for_each_entry(wq, &workqueues, list) { | 524 | list_for_each_entry(wq, &workqueues, list) { |
518 | /* Unbind so it can run. */ | 525 | /* Unbind so it can run. */ |
519 | kthread_bind(wq->cpu_wq[hotcpu].thread, | 526 | kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, |
520 | smp_processor_id()); | 527 | any_online_cpu(cpu_online_map)); |
521 | cleanup_workqueue_thread(wq, hotcpu); | 528 | cleanup_workqueue_thread(wq, hotcpu); |
522 | } | 529 | } |
523 | break; | 530 | break; |