aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/acct.c94
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/cpu.c63
-rw-r--r--kernel/cpuset.c480
-rw-r--r--kernel/exit.c41
-rw-r--r--kernel/fork.c53
-rw-r--r--kernel/futex.c22
-rw-r--r--kernel/irq/handle.c6
-rw-r--r--kernel/irq/manage.c16
-rw-r--r--kernel/kallsyms.c1
-rw-r--r--kernel/kexec.c11
-rw-r--r--kernel/kfifo.c4
-rw-r--r--kernel/kmod.c6
-rw-r--r--kernel/kprobes.c135
-rw-r--r--kernel/kthread.c13
-rw-r--r--kernel/module.c1
-rw-r--r--kernel/params.c11
-rw-r--r--kernel/posix-cpu-timers.c109
-rw-r--r--kernel/posix-timers.c31
-rw-r--r--kernel/power/Kconfig11
-rw-r--r--kernel/power/Makefile5
-rw-r--r--kernel/power/disk.c28
-rw-r--r--kernel/power/main.c20
-rw-r--r--kernel/power/pm.c1
-rw-r--r--kernel/power/power.h24
-rw-r--r--kernel/power/snapshot.c453
-rw-r--r--kernel/power/swsusp.c756
-rw-r--r--kernel/printk.c97
-rw-r--r--kernel/ptrace.c94
-rw-r--r--kernel/rcupdate.c23
-rw-r--r--kernel/rcutorture.c514
-rw-r--r--kernel/sched.c179
-rw-r--r--kernel/signal.c250
-rw-r--r--kernel/softirq.c3
-rw-r--r--kernel/softlockup.c6
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sys.c64
-rw-r--r--kernel/sysctl.c141
-rw-r--r--kernel/time.c27
-rw-r--r--kernel/timer.c337
-rw-r--r--kernel/workqueue.c45
43 files changed, 2573 insertions, 1620 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ff4dc02ce170..4f5a1453093a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
22obj-$(CONFIG_COMPAT) += compat.o 22obj-$(CONFIG_COMPAT) += compat.o
23obj-$(CONFIG_CPUSETS) += cpuset.o 23obj-$(CONFIG_CPUSETS) += cpuset.o
24obj-$(CONFIG_IKCONFIG) += configs.o 24obj-$(CONFIG_IKCONFIG) += configs.o
25obj-$(CONFIG_IKCONFIG_PROC) += configs.o
26obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 25obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
27obj-$(CONFIG_AUDIT) += audit.o 26obj-$(CONFIG_AUDIT) += audit.o
28obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
@@ -32,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
32obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 31obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
33obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 32obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
34obj-$(CONFIG_SECCOMP) += seccomp.o 33obj-$(CONFIG_SECCOMP) += seccomp.o
34obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
35 35
36ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 36ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
37# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 37# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index b756f527497e..6312d6bd43e3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -54,6 +54,7 @@
54#include <linux/jiffies.h> 54#include <linux/jiffies.h>
55#include <linux/times.h> 55#include <linux/times.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/mount.h>
57#include <asm/uaccess.h> 58#include <asm/uaccess.h>
58#include <asm/div64.h> 59#include <asm/div64.h>
59#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
@@ -192,6 +193,7 @@ static void acct_file_reopen(struct file *file)
192 add_timer(&acct_globals.timer); 193 add_timer(&acct_globals.timer);
193 } 194 }
194 if (old_acct) { 195 if (old_acct) {
196 mnt_unpin(old_acct->f_vfsmnt);
195 spin_unlock(&acct_globals.lock); 197 spin_unlock(&acct_globals.lock);
196 do_acct_process(0, old_acct); 198 do_acct_process(0, old_acct);
197 filp_close(old_acct, NULL); 199 filp_close(old_acct, NULL);
@@ -199,6 +201,42 @@ static void acct_file_reopen(struct file *file)
199 } 201 }
200} 202}
201 203
204static int acct_on(char *name)
205{
206 struct file *file;
207 int error;
208
209 /* Difference from BSD - they don't do O_APPEND */
210 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
211 if (IS_ERR(file))
212 return PTR_ERR(file);
213
214 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
215 filp_close(file, NULL);
216 return -EACCES;
217 }
218
219 if (!file->f_op->write) {
220 filp_close(file, NULL);
221 return -EIO;
222 }
223
224 error = security_acct(file);
225 if (error) {
226 filp_close(file, NULL);
227 return error;
228 }
229
230 spin_lock(&acct_globals.lock);
231 mnt_pin(file->f_vfsmnt);
232 acct_file_reopen(file);
233 spin_unlock(&acct_globals.lock);
234
235 mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */
236
237 return 0;
238}
239
202/** 240/**
203 * sys_acct - enable/disable process accounting 241 * sys_acct - enable/disable process accounting
204 * @name: file name for accounting records or NULL to shutdown accounting 242 * @name: file name for accounting records or NULL to shutdown accounting
@@ -212,47 +250,41 @@ static void acct_file_reopen(struct file *file)
212 */ 250 */
213asmlinkage long sys_acct(const char __user *name) 251asmlinkage long sys_acct(const char __user *name)
214{ 252{
215 struct file *file = NULL;
216 char *tmp;
217 int error; 253 int error;
218 254
219 if (!capable(CAP_SYS_PACCT)) 255 if (!capable(CAP_SYS_PACCT))
220 return -EPERM; 256 return -EPERM;
221 257
222 if (name) { 258 if (name) {
223 tmp = getname(name); 259 char *tmp = getname(name);
224 if (IS_ERR(tmp)) { 260 if (IS_ERR(tmp))
225 return (PTR_ERR(tmp)); 261 return (PTR_ERR(tmp));
226 } 262 error = acct_on(tmp);
227 /* Difference from BSD - they don't do O_APPEND */
228 file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
229 putname(tmp); 263 putname(tmp);
230 if (IS_ERR(file)) { 264 } else {
231 return (PTR_ERR(file)); 265 error = security_acct(NULL);
232 } 266 if (!error) {
233 if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { 267 spin_lock(&acct_globals.lock);
234 filp_close(file, NULL); 268 acct_file_reopen(NULL);
235 return (-EACCES); 269 spin_unlock(&acct_globals.lock);
236 }
237
238 if (!file->f_op->write) {
239 filp_close(file, NULL);
240 return (-EIO);
241 } 270 }
242 } 271 }
272 return error;
273}
243 274
244 error = security_acct(file); 275/**
245 if (error) { 276 * acct_auto_close_mnt - turn off a filesystem's accounting if it is on
246 if (file) 277 * @m: vfsmount being shut down
247 filp_close(file, NULL); 278 *
248 return error; 279 * If the accounting is turned on for a file in the subtree pointed to
249 } 280 * by m, turn accounting off. Done when m is about to die.
250 281 */
282void acct_auto_close_mnt(struct vfsmount *m)
283{
251 spin_lock(&acct_globals.lock); 284 spin_lock(&acct_globals.lock);
252 acct_file_reopen(file); 285 if (acct_globals.file && acct_globals.file->f_vfsmnt == m)
286 acct_file_reopen(NULL);
253 spin_unlock(&acct_globals.lock); 287 spin_unlock(&acct_globals.lock);
254
255 return (0);
256} 288}
257 289
258/** 290/**
@@ -266,8 +298,8 @@ void acct_auto_close(struct super_block *sb)
266{ 298{
267 spin_lock(&acct_globals.lock); 299 spin_lock(&acct_globals.lock);
268 if (acct_globals.file && 300 if (acct_globals.file &&
269 acct_globals.file->f_dentry->d_inode->i_sb == sb) { 301 acct_globals.file->f_vfsmnt->mnt_sb == sb) {
270 acct_file_reopen((struct file *)NULL); 302 acct_file_reopen(NULL);
271 } 303 }
272 spin_unlock(&acct_globals.lock); 304 spin_unlock(&acct_globals.lock);
273} 305}
@@ -553,7 +585,7 @@ void acct_update_integrals(struct task_struct *tsk)
553 if (delta == 0) 585 if (delta == 0)
554 return; 586 return;
555 tsk->acct_stimexpd = tsk->stime; 587 tsk->acct_stimexpd = tsk->stime;
556 tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); 588 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
557 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 589 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
558 } 590 }
559} 591}
diff --git a/kernel/audit.c b/kernel/audit.c
index 83096b67510a..0c56320d38dc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -133,7 +133,7 @@ struct audit_buffer {
133 struct list_head list; 133 struct list_head list;
134 struct sk_buff *skb; /* formatted skb ready to send */ 134 struct sk_buff *skb; /* formatted skb ready to send */
135 struct audit_context *ctx; /* NULL or associated context */ 135 struct audit_context *ctx; /* NULL or associated context */
136 int gfp_mask; 136 gfp_t gfp_mask;
137}; 137};
138 138
139static void audit_set_pid(struct audit_buffer *ab, pid_t pid) 139static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
@@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
560} 560}
561 561
562static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, 562static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
563 unsigned int __nocast gfp_mask, int type) 563 gfp_t gfp_mask, int type)
564{ 564{
565 unsigned long flags; 565 unsigned long flags;
566 struct audit_buffer *ab = NULL; 566 struct audit_buffer *ab = NULL;
@@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
647 * will be written at syscall exit. If there is no associated task, tsk 647 * will be written at syscall exit. If there is no associated task, tsk
648 * should be NULL. */ 648 * should be NULL. */
649 649
650struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, 650struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
651 int type) 651 int type)
652{ 652{
653 struct audit_buffer *ab = NULL; 653 struct audit_buffer *ab = NULL;
@@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab)
879/* Log an audit record. This is a convenience function that calls 879/* Log an audit record. This is a convenience function that calls
880 * audit_log_start, audit_log_vformat, and audit_log_end. It may be 880 * audit_log_start, audit_log_vformat, and audit_log_end. It may be
881 * called in any context. */ 881 * called in any context. */
882void audit_log(struct audit_context *ctx, int gfp_mask, int type, 882void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
883 const char *fmt, ...) 883 const char *fmt, ...)
884{ 884{
885 struct audit_buffer *ab; 885 struct audit_buffer *ab;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 88696f639aab..d8a68509e729 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab)
803 up_read(&mm->mmap_sem); 803 up_read(&mm->mmap_sem);
804} 804}
805 805
806static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) 806static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
807{ 807{
808 int i; 808 int i;
809 struct audit_buffer *ab; 809 struct audit_buffer *ab;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53d8263ae12e..e882c6babf41 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,28 +16,76 @@
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19DECLARE_MUTEX(cpucontrol); 19static DECLARE_MUTEX(cpucontrol);
20 20
21static struct notifier_block *cpu_chain; 21static struct notifier_block *cpu_chain;
22 22
23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner;
25static int lock_cpu_hotplug_depth;
26
27static int __lock_cpu_hotplug(int interruptible)
28{
29 int ret = 0;
30
31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible)
33 ret = down_interruptible(&cpucontrol);
34 else
35 down(&cpucontrol);
36 }
37
38 /*
39 * Set only if we succeed in locking
40 */
41 if (!ret) {
42 lock_cpu_hotplug_depth++;
43 lock_cpu_hotplug_owner = current;
44 }
45
46 return ret;
47}
48
49void lock_cpu_hotplug(void)
50{
51 __lock_cpu_hotplug(0);
52}
53EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
54
55void unlock_cpu_hotplug(void)
56{
57 if (--lock_cpu_hotplug_depth == 0) {
58 lock_cpu_hotplug_owner = NULL;
59 up(&cpucontrol);
60 }
61}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
63
64int lock_cpu_hotplug_interruptible(void)
65{
66 return __lock_cpu_hotplug(1);
67}
68EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
69#endif /* CONFIG_HOTPLUG_CPU */
70
23/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
24int register_cpu_notifier(struct notifier_block *nb) 72int register_cpu_notifier(struct notifier_block *nb)
25{ 73{
26 int ret; 74 int ret;
27 75
28 if ((ret = down_interruptible(&cpucontrol)) != 0) 76 if ((ret = lock_cpu_hotplug_interruptible()) != 0)
29 return ret; 77 return ret;
30 ret = notifier_chain_register(&cpu_chain, nb); 78 ret = notifier_chain_register(&cpu_chain, nb);
31 up(&cpucontrol); 79 unlock_cpu_hotplug();
32 return ret; 80 return ret;
33} 81}
34EXPORT_SYMBOL(register_cpu_notifier); 82EXPORT_SYMBOL(register_cpu_notifier);
35 83
36void unregister_cpu_notifier(struct notifier_block *nb) 84void unregister_cpu_notifier(struct notifier_block *nb)
37{ 85{
38 down(&cpucontrol); 86 lock_cpu_hotplug();
39 notifier_chain_unregister(&cpu_chain, nb); 87 notifier_chain_unregister(&cpu_chain, nb);
40 up(&cpucontrol); 88 unlock_cpu_hotplug();
41} 89}
42EXPORT_SYMBOL(unregister_cpu_notifier); 90EXPORT_SYMBOL(unregister_cpu_notifier);
43 91
@@ -155,13 +203,14 @@ int __devinit cpu_up(unsigned int cpu)
155 int ret; 203 int ret;
156 void *hcpu = (void *)(long)cpu; 204 void *hcpu = (void *)(long)cpu;
157 205
158 if ((ret = down_interruptible(&cpucontrol)) != 0) 206 if ((ret = lock_cpu_hotplug_interruptible()) != 0)
159 return ret; 207 return ret;
160 208
161 if (cpu_online(cpu) || !cpu_present(cpu)) { 209 if (cpu_online(cpu) || !cpu_present(cpu)) {
162 ret = -EINVAL; 210 ret = -EINVAL;
163 goto out; 211 goto out;
164 } 212 }
213
165 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 214 ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
166 if (ret == NOTIFY_BAD) { 215 if (ret == NOTIFY_BAD) {
167 printk("%s: attempt to bring up CPU %u failed\n", 216 printk("%s: attempt to bring up CPU %u failed\n",
@@ -184,6 +233,6 @@ out_notify:
184 if (ret != 0) 233 if (ret != 0)
185 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); 234 notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
186out: 235out:
187 up(&cpucontrol); 236 unlock_cpu_hotplug();
188 return ret; 237 return ret;
189} 238}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 79866bc6b3a1..7430640f9816 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -32,6 +32,7 @@
32#include <linux/kernel.h> 32#include <linux/kernel.h>
33#include <linux/kmod.h> 33#include <linux/kmod.h>
34#include <linux/list.h> 34#include <linux/list.h>
35#include <linux/mempolicy.h>
35#include <linux/mm.h> 36#include <linux/mm.h>
36#include <linux/module.h> 37#include <linux/module.h>
37#include <linux/mount.h> 38#include <linux/mount.h>
@@ -60,6 +61,9 @@ struct cpuset {
60 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 61 cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
61 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 62 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
62 63
64 /*
65 * Count is atomic so can incr (fork) or decr (exit) without a lock.
66 */
63 atomic_t count; /* count tasks using this cpuset */ 67 atomic_t count; /* count tasks using this cpuset */
64 68
65 /* 69 /*
@@ -142,80 +146,91 @@ static struct vfsmount *cpuset_mount;
142static struct super_block *cpuset_sb = NULL; 146static struct super_block *cpuset_sb = NULL;
143 147
144/* 148/*
145 * cpuset_sem should be held by anyone who is depending on the children 149 * We have two global cpuset semaphores below. They can nest.
146 * or sibling lists of any cpuset, or performing non-atomic operations 150 * It is ok to first take manage_sem, then nest callback_sem. We also
147 * on the flags or *_allowed values of a cpuset, such as raising the 151 * require taking task_lock() when dereferencing a task's cpuset pointer.
148 * CS_REMOVED flag bit iff it is not already raised, or reading and 152 * See "The task_lock() exception", at the end of this comment.
149 * conditionally modifying the *_allowed values. One kernel global 153 *
150 * cpuset semaphore should be sufficient - these things don't change 154 * A task must hold both semaphores to modify cpusets. If a task
151 * that much. 155 * holds manage_sem, then it blocks others wanting that semaphore,
152 * 156 * ensuring that it is the only task able to also acquire callback_sem
153 * The code that modifies cpusets holds cpuset_sem across the entire 157 * and be able to modify cpusets. It can perform various checks on
154 * operation, from cpuset_common_file_write() down, single threading 158 * the cpuset structure first, knowing nothing will change. It can
155 * all cpuset modifications (except for counter manipulations from 159 * also allocate memory while just holding manage_sem. While it is
156 * fork and exit) across the system. This presumes that cpuset 160 * performing these checks, various callback routines can briefly
157 * modifications are rare - better kept simple and safe, even if slow. 161 * acquire callback_sem to query cpusets. Once it is ready to make
158 * 162 * the changes, it takes callback_sem, blocking everyone else.
159 * The code that reads cpusets, such as in cpuset_common_file_read() 163 *
160 * and below, only holds cpuset_sem across small pieces of code, such 164 * Calls to the kernel memory allocator can not be made while holding
161 * as when reading out possibly multi-word cpumasks and nodemasks, as 165 * callback_sem, as that would risk double tripping on callback_sem
162 * the risks are less, and the desire for performance a little greater. 166 * from one of the callbacks into the cpuset code from within
163 * The proc_cpuset_show() routine needs to hold cpuset_sem to insure 167 * __alloc_pages().
164 * that no cs->dentry is NULL, as it walks up the cpuset tree to root. 168 *
165 * 169 * If a task is only holding callback_sem, then it has read-only
166 * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't 170 * access to cpusets.
167 * (usually) grab cpuset_sem. These are the two most performance 171 *
168 * critical pieces of code here. The exception occurs on exit(), 172 * The task_struct fields mems_allowed and mems_generation may only
169 * when a task in a notify_on_release cpuset exits. Then cpuset_sem 173 * be accessed in the context of that task, so require no locks.
174 *
175 * Any task can increment and decrement the count field without lock.
176 * So in general, code holding manage_sem or callback_sem can't rely
177 * on the count field not changing. However, if the count goes to
178 * zero, then only attach_task(), which holds both semaphores, can
179 * increment it again. Because a count of zero means that no tasks
180 * are currently attached, therefore there is no way a task attached
181 * to that cpuset can fork (the other way to increment the count).
182 * So code holding manage_sem or callback_sem can safely assume that
183 * if the count is zero, it will stay zero. Similarly, if a task
184 * holds manage_sem or callback_sem on a cpuset with zero count, it
185 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
186 * both of those semaphores.
187 *
188 * A possible optimization to improve parallelism would be to make
189 * callback_sem a R/W semaphore (rwsem), allowing the callback routines
190 * to proceed in parallel, with read access, until the holder of
191 * manage_sem needed to take this rwsem for exclusive write access
192 * and modify some cpusets.
193 *
194 * The cpuset_common_file_write handler for operations that modify
195 * the cpuset hierarchy holds manage_sem across the entire operation,
196 * single threading all such cpuset modifications across the system.
197 *
198 * The cpuset_common_file_read() handlers only hold callback_sem across
199 * small pieces of code, such as when reading out possibly multi-word
200 * cpumasks and nodemasks.
201 *
202 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
203 * (usually) take either semaphore. These are the two most performance
204 * critical pieces of code here. The exception occurs on cpuset_exit(),
205 * when a task in a notify_on_release cpuset exits. Then manage_sem
170 * is taken, and if the cpuset count is zero, a usermode call made 206 * is taken, and if the cpuset count is zero, a usermode call made
171 * to /sbin/cpuset_release_agent with the name of the cpuset (path 207 * to /sbin/cpuset_release_agent with the name of the cpuset (path
172 * relative to the root of cpuset file system) as the argument. 208 * relative to the root of cpuset file system) as the argument.
173 * 209 *
174 * A cpuset can only be deleted if both its 'count' of using tasks is 210 * A cpuset can only be deleted if both its 'count' of using tasks
175 * zero, and its list of 'children' cpusets is empty. Since all tasks 211 * is zero, and its list of 'children' cpusets is empty. Since all
176 * in the system use _some_ cpuset, and since there is always at least 212 * tasks in the system use _some_ cpuset, and since there is always at
177 * one task in the system (init, pid == 1), therefore, top_cpuset 213 * least one task in the system (init, pid == 1), therefore, top_cpuset
178 * always has either children cpusets and/or using tasks. So no need 214 * always has either children cpusets and/or using tasks. So we don't
179 * for any special hack to ensure that top_cpuset cannot be deleted. 215 * need a special hack to ensure that top_cpuset cannot be deleted.
216 *
217 * The above "Tale of Two Semaphores" would be complete, but for:
218 *
219 * The task_lock() exception
220 *
221 * The need for this exception arises from the action of attach_task(),
222 * which overwrites one task's cpuset pointer with another. It does
223 * so using both semaphores, however there are several performance
224 * critical places that need to reference task->cpuset without the
225 * expense of grabbing a system global semaphore. Therefore except as
226 * noted below, when dereferencing or, as in attach_task(), modifying
227 * a task's cpuset pointer we use task_lock(), which acts on a spinlock
228 * (task->alloc_lock) already in the task_struct routinely used for
229 * such matters.
180 */ 230 */
181 231
182static DECLARE_MUTEX(cpuset_sem); 232static DECLARE_MUTEX(manage_sem);
183static struct task_struct *cpuset_sem_owner; 233static DECLARE_MUTEX(callback_sem);
184static int cpuset_sem_depth;
185
186/*
187 * The global cpuset semaphore cpuset_sem can be needed by the
188 * memory allocator to update a tasks mems_allowed (see the calls
189 * to cpuset_update_current_mems_allowed()) or to walk up the
190 * cpuset hierarchy to find a mem_exclusive cpuset see the calls
191 * to cpuset_excl_nodes_overlap()).
192 *
193 * But if the memory allocation is being done by cpuset.c code, it
194 * usually already holds cpuset_sem. Double tripping on a kernel
195 * semaphore deadlocks the current task, and any other task that
196 * subsequently tries to obtain the lock.
197 *
198 * Run all up's and down's on cpuset_sem through the following
199 * wrappers, which will detect this nested locking, and avoid
200 * deadlocking.
201 */
202
203static inline void cpuset_down(struct semaphore *psem)
204{
205 if (cpuset_sem_owner != current) {
206 down(psem);
207 cpuset_sem_owner = current;
208 }
209 cpuset_sem_depth++;
210}
211
212static inline void cpuset_up(struct semaphore *psem)
213{
214 if (--cpuset_sem_depth == 0) {
215 cpuset_sem_owner = NULL;
216 up(psem);
217 }
218}
219 234
220/* 235/*
221 * A couple of forward declarations required, due to cyclic reference loop: 236 * A couple of forward declarations required, due to cyclic reference loop:
@@ -390,7 +405,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
390} 405}
391 406
392/* 407/*
393 * Call with cpuset_sem held. Writes path of cpuset into buf. 408 * Call with manage_sem held. Writes path of cpuset into buf.
394 * Returns 0 on success, -errno on error. 409 * Returns 0 on success, -errno on error.
395 */ 410 */
396 411
@@ -442,10 +457,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
442 * status of the /sbin/cpuset_release_agent task, so no sense holding 457 * status of the /sbin/cpuset_release_agent task, so no sense holding
443 * our caller up for that. 458 * our caller up for that.
444 * 459 *
445 * The simple act of forking that task might require more memory, 460 * When we had only one cpuset semaphore, we had to call this
446 * which might need cpuset_sem. So this routine must be called while 461 * without holding it, to avoid deadlock when call_usermodehelper()
447 * cpuset_sem is not held, to avoid a possible deadlock. See also 462 * allocated memory. With two locks, we could now call this while
448 * comments for check_for_release(), below. 463 * holding manage_sem, but we still don't, so as to minimize
464 * the time manage_sem is held.
449 */ 465 */
450 466
451static void cpuset_release_agent(const char *pathbuf) 467static void cpuset_release_agent(const char *pathbuf)
@@ -477,15 +493,15 @@ static void cpuset_release_agent(const char *pathbuf)
477 * cs is notify_on_release() and now both the user count is zero and 493 * cs is notify_on_release() and now both the user count is zero and
478 * the list of children is empty, prepare cpuset path in a kmalloc'd 494 * the list of children is empty, prepare cpuset path in a kmalloc'd
479 * buffer, to be returned via ppathbuf, so that the caller can invoke 495 * buffer, to be returned via ppathbuf, so that the caller can invoke
480 * cpuset_release_agent() with it later on, once cpuset_sem is dropped. 496 * cpuset_release_agent() with it later on, once manage_sem is dropped.
481 * Call here with cpuset_sem held. 497 * Call here with manage_sem held.
482 * 498 *
483 * This check_for_release() routine is responsible for kmalloc'ing 499 * This check_for_release() routine is responsible for kmalloc'ing
484 * pathbuf. The above cpuset_release_agent() is responsible for 500 * pathbuf. The above cpuset_release_agent() is responsible for
485 * kfree'ing pathbuf. The caller of these routines is responsible 501 * kfree'ing pathbuf. The caller of these routines is responsible
486 * for providing a pathbuf pointer, initialized to NULL, then 502 * for providing a pathbuf pointer, initialized to NULL, then
487 * calling check_for_release() with cpuset_sem held and the address 503 * calling check_for_release() with manage_sem held and the address
488 * of the pathbuf pointer, then dropping cpuset_sem, then calling 504 * of the pathbuf pointer, then dropping manage_sem, then calling
489 * cpuset_release_agent() with pathbuf, as set by check_for_release(). 505 * cpuset_release_agent() with pathbuf, as set by check_for_release().
490 */ 506 */
491 507
@@ -516,7 +532,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
516 * One way or another, we guarantee to return some non-empty subset 532 * One way or another, we guarantee to return some non-empty subset
517 * of cpu_online_map. 533 * of cpu_online_map.
518 * 534 *
519 * Call with cpuset_sem held. 535 * Call with callback_sem held.
520 */ 536 */
521 537
522static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask) 538static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -540,7 +556,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
540 * One way or another, we guarantee to return some non-empty subset 556 * One way or another, we guarantee to return some non-empty subset
541 * of node_online_map. 557 * of node_online_map.
542 * 558 *
543 * Call with cpuset_sem held. 559 * Call with callback_sem held.
544 */ 560 */
545 561
546static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 562static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -555,22 +571,47 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
555} 571}
556 572
557/* 573/*
558 * Refresh current tasks mems_allowed and mems_generation from 574 * Refresh current tasks mems_allowed and mems_generation from current
559 * current tasks cpuset. Call with cpuset_sem held. 575 * tasks cpuset.
576 *
577 * Call without callback_sem or task_lock() held. May be called with
578 * or without manage_sem held. Will acquire task_lock() and might
579 * acquire callback_sem during call.
580 *
581 * The task_lock() is required to dereference current->cpuset safely.
582 * Without it, we could pick up the pointer value of current->cpuset
583 * in one instruction, and then attach_task could give us a different
584 * cpuset, and then the cpuset we had could be removed and freed,
585 * and then on our next instruction, we could dereference a no longer
586 * valid cpuset pointer to get its mems_generation field.
560 * 587 *
561 * This routine is needed to update the per-task mems_allowed 588 * This routine is needed to update the per-task mems_allowed data,
562 * data, within the tasks context, when it is trying to allocate 589 * within the tasks context, when it is trying to allocate memory
563 * memory (in various mm/mempolicy.c routines) and notices 590 * (in various mm/mempolicy.c routines) and notices that some other
564 * that some other task has been modifying its cpuset. 591 * task has been modifying its cpuset.
565 */ 592 */
566 593
567static void refresh_mems(void) 594static void refresh_mems(void)
568{ 595{
569 struct cpuset *cs = current->cpuset; 596 int my_cpusets_mem_gen;
570 597
571 if (current->cpuset_mems_generation != cs->mems_generation) { 598 task_lock(current);
599 my_cpusets_mem_gen = current->cpuset->mems_generation;
600 task_unlock(current);
601
602 if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
603 struct cpuset *cs;
604 nodemask_t oldmem = current->mems_allowed;
605
606 down(&callback_sem);
607 task_lock(current);
608 cs = current->cpuset;
572 guarantee_online_mems(cs, &current->mems_allowed); 609 guarantee_online_mems(cs, &current->mems_allowed);
573 current->cpuset_mems_generation = cs->mems_generation; 610 current->cpuset_mems_generation = cs->mems_generation;
611 task_unlock(current);
612 up(&callback_sem);
613 if (!nodes_equal(oldmem, current->mems_allowed))
614 numa_policy_rebind(&oldmem, &current->mems_allowed);
574 } 615 }
575} 616}
576 617
@@ -579,7 +620,7 @@ static void refresh_mems(void)
579 * 620 *
580 * One cpuset is a subset of another if all its allowed CPUs and 621 * One cpuset is a subset of another if all its allowed CPUs and
581 * Memory Nodes are a subset of the other, and its exclusive flags 622 * Memory Nodes are a subset of the other, and its exclusive flags
582 * are only set if the other's are set. 623 * are only set if the other's are set. Call holding manage_sem.
583 */ 624 */
584 625
585static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 626static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -597,7 +638,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
597 * If we replaced the flag and mask values of the current cpuset 638 * If we replaced the flag and mask values of the current cpuset
598 * (cur) with those values in the trial cpuset (trial), would 639 * (cur) with those values in the trial cpuset (trial), would
599 * our various subset and exclusive rules still be valid? Presumes 640 * our various subset and exclusive rules still be valid? Presumes
600 * cpuset_sem held. 641 * manage_sem held.
601 * 642 *
602 * 'cur' is the address of an actual, in-use cpuset. Operations 643 * 'cur' is the address of an actual, in-use cpuset. Operations
603 * such as list traversal that depend on the actual address of the 644 * such as list traversal that depend on the actual address of the
@@ -651,7 +692,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
651 * exclusive child cpusets 692 * exclusive child cpusets
652 * Build these two partitions by calling partition_sched_domains 693 * Build these two partitions by calling partition_sched_domains
653 * 694 *
654 * Call with cpuset_sem held. May nest a call to the 695 * Call with manage_sem held. May nest a call to the
655 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 696 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
656 */ 697 */
657 698
@@ -696,6 +737,10 @@ static void update_cpu_domains(struct cpuset *cur)
696 unlock_cpu_hotplug(); 737 unlock_cpu_hotplug();
697} 738}
698 739
740/*
741 * Call with manage_sem held. May take callback_sem during call.
742 */
743
699static int update_cpumask(struct cpuset *cs, char *buf) 744static int update_cpumask(struct cpuset *cs, char *buf)
700{ 745{
701 struct cpuset trialcs; 746 struct cpuset trialcs;
@@ -712,12 +757,18 @@ static int update_cpumask(struct cpuset *cs, char *buf)
712 if (retval < 0) 757 if (retval < 0)
713 return retval; 758 return retval;
714 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); 759 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
760 down(&callback_sem);
715 cs->cpus_allowed = trialcs.cpus_allowed; 761 cs->cpus_allowed = trialcs.cpus_allowed;
762 up(&callback_sem);
716 if (is_cpu_exclusive(cs) && !cpus_unchanged) 763 if (is_cpu_exclusive(cs) && !cpus_unchanged)
717 update_cpu_domains(cs); 764 update_cpu_domains(cs);
718 return 0; 765 return 0;
719} 766}
720 767
768/*
769 * Call with manage_sem held. May take callback_sem during call.
770 */
771
721static int update_nodemask(struct cpuset *cs, char *buf) 772static int update_nodemask(struct cpuset *cs, char *buf)
722{ 773{
723 struct cpuset trialcs; 774 struct cpuset trialcs;
@@ -732,9 +783,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
732 return -ENOSPC; 783 return -ENOSPC;
733 retval = validate_change(cs, &trialcs); 784 retval = validate_change(cs, &trialcs);
734 if (retval == 0) { 785 if (retval == 0) {
786 down(&callback_sem);
735 cs->mems_allowed = trialcs.mems_allowed; 787 cs->mems_allowed = trialcs.mems_allowed;
736 atomic_inc(&cpuset_mems_generation); 788 atomic_inc(&cpuset_mems_generation);
737 cs->mems_generation = atomic_read(&cpuset_mems_generation); 789 cs->mems_generation = atomic_read(&cpuset_mems_generation);
790 up(&callback_sem);
738 } 791 }
739 return retval; 792 return retval;
740} 793}
@@ -745,6 +798,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
745 * CS_NOTIFY_ON_RELEASE) 798 * CS_NOTIFY_ON_RELEASE)
746 * cs: the cpuset to update 799 * cs: the cpuset to update
747 * buf: the buffer where we read the 0 or 1 800 * buf: the buffer where we read the 0 or 1
801 *
802 * Call with manage_sem held.
748 */ 803 */
749 804
750static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 805static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -766,16 +821,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
766 return err; 821 return err;
767 cpu_exclusive_changed = 822 cpu_exclusive_changed =
768 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); 823 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
824 down(&callback_sem);
769 if (turning_on) 825 if (turning_on)
770 set_bit(bit, &cs->flags); 826 set_bit(bit, &cs->flags);
771 else 827 else
772 clear_bit(bit, &cs->flags); 828 clear_bit(bit, &cs->flags);
829 up(&callback_sem);
773 830
774 if (cpu_exclusive_changed) 831 if (cpu_exclusive_changed)
775 update_cpu_domains(cs); 832 update_cpu_domains(cs);
776 return 0; 833 return 0;
777} 834}
778 835
836/*
837 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
838 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
839 * notified on release.
840 *
841 * Call holding manage_sem. May take callback_sem and task_lock of
842 * the task 'pid' during call.
843 */
844
779static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) 845static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
780{ 846{
781 pid_t pid; 847 pid_t pid;
@@ -792,7 +858,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
792 read_lock(&tasklist_lock); 858 read_lock(&tasklist_lock);
793 859
794 tsk = find_task_by_pid(pid); 860 tsk = find_task_by_pid(pid);
795 if (!tsk) { 861 if (!tsk || tsk->flags & PF_EXITING) {
796 read_unlock(&tasklist_lock); 862 read_unlock(&tasklist_lock);
797 return -ESRCH; 863 return -ESRCH;
798 } 864 }
@@ -810,10 +876,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
810 get_task_struct(tsk); 876 get_task_struct(tsk);
811 } 877 }
812 878
879 down(&callback_sem);
880
813 task_lock(tsk); 881 task_lock(tsk);
814 oldcs = tsk->cpuset; 882 oldcs = tsk->cpuset;
815 if (!oldcs) { 883 if (!oldcs) {
816 task_unlock(tsk); 884 task_unlock(tsk);
885 up(&callback_sem);
817 put_task_struct(tsk); 886 put_task_struct(tsk);
818 return -ESRCH; 887 return -ESRCH;
819 } 888 }
@@ -824,6 +893,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
824 guarantee_online_cpus(cs, &cpus); 893 guarantee_online_cpus(cs, &cpus);
825 set_cpus_allowed(tsk, cpus); 894 set_cpus_allowed(tsk, cpus);
826 895
896 up(&callback_sem);
827 put_task_struct(tsk); 897 put_task_struct(tsk);
828 if (atomic_dec_and_test(&oldcs->count)) 898 if (atomic_dec_and_test(&oldcs->count))
829 check_for_release(oldcs, ppathbuf); 899 check_for_release(oldcs, ppathbuf);
@@ -867,7 +937,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
867 } 937 }
868 buffer[nbytes] = 0; /* nul-terminate */ 938 buffer[nbytes] = 0; /* nul-terminate */
869 939
870 cpuset_down(&cpuset_sem); 940 down(&manage_sem);
871 941
872 if (is_removed(cs)) { 942 if (is_removed(cs)) {
873 retval = -ENODEV; 943 retval = -ENODEV;
@@ -901,7 +971,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
901 if (retval == 0) 971 if (retval == 0)
902 retval = nbytes; 972 retval = nbytes;
903out2: 973out2:
904 cpuset_up(&cpuset_sem); 974 up(&manage_sem);
905 cpuset_release_agent(pathbuf); 975 cpuset_release_agent(pathbuf);
906out1: 976out1:
907 kfree(buffer); 977 kfree(buffer);
@@ -941,9 +1011,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
941{ 1011{
942 cpumask_t mask; 1012 cpumask_t mask;
943 1013
944 cpuset_down(&cpuset_sem); 1014 down(&callback_sem);
945 mask = cs->cpus_allowed; 1015 mask = cs->cpus_allowed;
946 cpuset_up(&cpuset_sem); 1016 up(&callback_sem);
947 1017
948 return cpulist_scnprintf(page, PAGE_SIZE, mask); 1018 return cpulist_scnprintf(page, PAGE_SIZE, mask);
949} 1019}
@@ -952,9 +1022,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
952{ 1022{
953 nodemask_t mask; 1023 nodemask_t mask;
954 1024
955 cpuset_down(&cpuset_sem); 1025 down(&callback_sem);
956 mask = cs->mems_allowed; 1026 mask = cs->mems_allowed;
957 cpuset_up(&cpuset_sem); 1027 up(&callback_sem);
958 1028
959 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1029 return nodelist_scnprintf(page, PAGE_SIZE, mask);
960} 1030}
@@ -968,8 +1038,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
968 char *page; 1038 char *page;
969 ssize_t retval = 0; 1039 ssize_t retval = 0;
970 char *s; 1040 char *s;
971 char *start;
972 size_t n;
973 1041
974 if (!(page = (char *)__get_free_page(GFP_KERNEL))) 1042 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
975 return -ENOMEM; 1043 return -ENOMEM;
@@ -997,16 +1065,8 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
997 goto out; 1065 goto out;
998 } 1066 }
999 *s++ = '\n'; 1067 *s++ = '\n';
1000 *s = '\0';
1001 1068
1002 /* Do nothing if *ppos is at the eof or beyond the eof. */ 1069 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1003 if (s - page <= *ppos)
1004 return 0;
1005
1006 start = page + *ppos;
1007 n = s - start;
1008 retval = n - copy_to_user(buf, start, min(n, nbytes));
1009 *ppos += retval;
1010out: 1070out:
1011 free_page((unsigned long)page); 1071 free_page((unsigned long)page);
1012 return retval; 1072 return retval;
@@ -1057,6 +1117,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file)
1057 return 0; 1117 return 0;
1058} 1118}
1059 1119
1120/*
1121 * cpuset_rename - Only allow simple rename of directories in place.
1122 */
1123static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
1124 struct inode *new_dir, struct dentry *new_dentry)
1125{
1126 if (!S_ISDIR(old_dentry->d_inode->i_mode))
1127 return -ENOTDIR;
1128 if (new_dentry->d_inode)
1129 return -EEXIST;
1130 if (old_dir != new_dir)
1131 return -EIO;
1132 return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
1133}
1134
1060static struct file_operations cpuset_file_operations = { 1135static struct file_operations cpuset_file_operations = {
1061 .read = cpuset_file_read, 1136 .read = cpuset_file_read,
1062 .write = cpuset_file_write, 1137 .write = cpuset_file_write,
@@ -1069,6 +1144,7 @@ static struct inode_operations cpuset_dir_inode_operations = {
1069 .lookup = simple_lookup, 1144 .lookup = simple_lookup,
1070 .mkdir = cpuset_mkdir, 1145 .mkdir = cpuset_mkdir,
1071 .rmdir = cpuset_rmdir, 1146 .rmdir = cpuset_rmdir,
1147 .rename = cpuset_rename,
1072}; 1148};
1073 1149
1074static int cpuset_create_file(struct dentry *dentry, int mode) 1150static int cpuset_create_file(struct dentry *dentry, int mode)
@@ -1172,7 +1248,9 @@ struct ctr_struct {
1172 1248
1173/* 1249/*
1174 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'. 1250 * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
1175 * Return actual number of pids loaded. 1251 * Return actual number of pids loaded. No need to task_lock(p)
1252 * when reading out p->cpuset, as we don't really care if it changes
1253 * on the next cycle, and we are not going to try to dereference it.
1176 */ 1254 */
1177static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) 1255static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1178{ 1256{
@@ -1214,6 +1292,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
1214 return cnt; 1292 return cnt;
1215} 1293}
1216 1294
1295/*
1296 * Handle an open on 'tasks' file. Prepare a buffer listing the
1297 * process id's of tasks currently attached to the cpuset being opened.
1298 *
1299 * Does not require any specific cpuset semaphores, and does not take any.
1300 */
1217static int cpuset_tasks_open(struct inode *unused, struct file *file) 1301static int cpuset_tasks_open(struct inode *unused, struct file *file)
1218{ 1302{
1219 struct cpuset *cs = __d_cs(file->f_dentry->d_parent); 1303 struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1361,7 +1445,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1361 if (!cs) 1445 if (!cs)
1362 return -ENOMEM; 1446 return -ENOMEM;
1363 1447
1364 cpuset_down(&cpuset_sem); 1448 down(&manage_sem);
1449 refresh_mems();
1365 cs->flags = 0; 1450 cs->flags = 0;
1366 if (notify_on_release(parent)) 1451 if (notify_on_release(parent))
1367 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1452 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1375,25 +1460,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1375 1460
1376 cs->parent = parent; 1461 cs->parent = parent;
1377 1462
1463 down(&callback_sem);
1378 list_add(&cs->sibling, &cs->parent->children); 1464 list_add(&cs->sibling, &cs->parent->children);
1465 up(&callback_sem);
1379 1466
1380 err = cpuset_create_dir(cs, name, mode); 1467 err = cpuset_create_dir(cs, name, mode);
1381 if (err < 0) 1468 if (err < 0)
1382 goto err; 1469 goto err;
1383 1470
1384 /* 1471 /*
1385 * Release cpuset_sem before cpuset_populate_dir() because it 1472 * Release manage_sem before cpuset_populate_dir() because it
1386 * will down() this new directory's i_sem and if we race with 1473 * will down() this new directory's i_sem and if we race with
1387 * another mkdir, we might deadlock. 1474 * another mkdir, we might deadlock.
1388 */ 1475 */
1389 cpuset_up(&cpuset_sem); 1476 up(&manage_sem);
1390 1477
1391 err = cpuset_populate_dir(cs->dentry); 1478 err = cpuset_populate_dir(cs->dentry);
1392 /* If err < 0, we have a half-filled directory - oh well ;) */ 1479 /* If err < 0, we have a half-filled directory - oh well ;) */
1393 return 0; 1480 return 0;
1394err: 1481err:
1395 list_del(&cs->sibling); 1482 list_del(&cs->sibling);
1396 cpuset_up(&cpuset_sem); 1483 up(&manage_sem);
1397 kfree(cs); 1484 kfree(cs);
1398 return err; 1485 return err;
1399} 1486}
@@ -1415,29 +1502,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1415 1502
1416 /* the vfs holds both inode->i_sem already */ 1503 /* the vfs holds both inode->i_sem already */
1417 1504
1418 cpuset_down(&cpuset_sem); 1505 down(&manage_sem);
1506 refresh_mems();
1419 if (atomic_read(&cs->count) > 0) { 1507 if (atomic_read(&cs->count) > 0) {
1420 cpuset_up(&cpuset_sem); 1508 up(&manage_sem);
1421 return -EBUSY; 1509 return -EBUSY;
1422 } 1510 }
1423 if (!list_empty(&cs->children)) { 1511 if (!list_empty(&cs->children)) {
1424 cpuset_up(&cpuset_sem); 1512 up(&manage_sem);
1425 return -EBUSY; 1513 return -EBUSY;
1426 } 1514 }
1427 parent = cs->parent; 1515 parent = cs->parent;
1516 down(&callback_sem);
1428 set_bit(CS_REMOVED, &cs->flags); 1517 set_bit(CS_REMOVED, &cs->flags);
1429 if (is_cpu_exclusive(cs)) 1518 if (is_cpu_exclusive(cs))
1430 update_cpu_domains(cs); 1519 update_cpu_domains(cs);
1431 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1520 list_del(&cs->sibling); /* delete my sibling from parent->children */
1432 if (list_empty(&parent->children))
1433 check_for_release(parent, &pathbuf);
1434 spin_lock(&cs->dentry->d_lock); 1521 spin_lock(&cs->dentry->d_lock);
1435 d = dget(cs->dentry); 1522 d = dget(cs->dentry);
1436 cs->dentry = NULL; 1523 cs->dentry = NULL;
1437 spin_unlock(&d->d_lock); 1524 spin_unlock(&d->d_lock);
1438 cpuset_d_remove_dir(d); 1525 cpuset_d_remove_dir(d);
1439 dput(d); 1526 dput(d);
1440 cpuset_up(&cpuset_sem); 1527 up(&callback_sem);
1528 if (list_empty(&parent->children))
1529 check_for_release(parent, &pathbuf);
1530 up(&manage_sem);
1441 cpuset_release_agent(pathbuf); 1531 cpuset_release_agent(pathbuf);
1442 return 0; 1532 return 0;
1443} 1533}
@@ -1497,16 +1587,26 @@ void __init cpuset_init_smp(void)
1497 * cpuset_fork - attach newly forked task to its parent's cpuset. 1587 * cpuset_fork - attach newly forked task to its parent's cpuset.
1498 * @tsk: pointer to task_struct of forking parent process. 1588 * @tsk: pointer to task_struct of forking parent process.
1499 * 1589 *
1500 * Description: By default, on fork, a task inherits its 1590 * Description: A task inherits its parent's cpuset at fork().
1501 * parent's cpuset. The pointer to the shared cpuset is 1591 *
1502 * automatically copied in fork.c by dup_task_struct(). 1592 * A pointer to the shared cpuset was automatically copied in fork.c
1503 * This cpuset_fork() routine need only increment the usage 1593 * by dup_task_struct(). However, we ignore that copy, since it was
1504 * counter in that cpuset. 1594 * not made under the protection of task_lock(), so might no longer be
1595 * a valid cpuset pointer. attach_task() might have already changed
1596 * current->cpuset, allowing the previously referenced cpuset to
1597 * be removed and freed. Instead, we task_lock(current) and copy
1598 * its present value of current->cpuset for our freshly forked child.
1599 *
1600 * At the point that cpuset_fork() is called, 'current' is the parent
1601 * task, and the passed argument 'child' points to the child task.
1505 **/ 1602 **/
1506 1603
1507void cpuset_fork(struct task_struct *tsk) 1604void cpuset_fork(struct task_struct *child)
1508{ 1605{
1509 atomic_inc(&tsk->cpuset->count); 1606 task_lock(current);
1607 child->cpuset = current->cpuset;
1608 atomic_inc(&child->cpuset->count);
1609 task_unlock(current);
1510} 1610}
1511 1611
1512/** 1612/**
@@ -1515,35 +1615,42 @@ void cpuset_fork(struct task_struct *tsk)
1515 * 1615 *
1516 * Description: Detach cpuset from @tsk and release it. 1616 * Description: Detach cpuset from @tsk and release it.
1517 * 1617 *
1518 * Note that cpusets marked notify_on_release force every task 1618 * Note that cpusets marked notify_on_release force every task in
1519 * in them to take the global cpuset_sem semaphore when exiting. 1619 * them to take the global manage_sem semaphore when exiting.
1520 * This could impact scaling on very large systems. Be reluctant 1620 * This could impact scaling on very large systems. Be reluctant to
1521 * to use notify_on_release cpusets where very high task exit 1621 * use notify_on_release cpusets where very high task exit scaling
1522 * scaling is required on large systems. 1622 * is required on large systems.
1523 * 1623 *
1524 * Don't even think about dereferencing 'cs' after the cpuset use 1624 * Don't even think about dereferencing 'cs' after the cpuset use count
1525 * count goes to zero, except inside a critical section guarded 1625 * goes to zero, except inside a critical section guarded by manage_sem
1526 * by the cpuset_sem semaphore. If you don't hold cpuset_sem, 1626 * or callback_sem. Otherwise a zero cpuset use count is a license to
1527 * then a zero cpuset use count is a license to any other task to 1627 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
1528 * nuke the cpuset immediately. 1628 *
1629 * This routine has to take manage_sem, not callback_sem, because
1630 * it is holding that semaphore while calling check_for_release(),
1631 * which calls kmalloc(), so can't be called holding callback_sem.
1632 *
1633 * We don't need to task_lock() this reference to tsk->cpuset,
1634 * because tsk is already marked PF_EXITING, so attach_task() won't
1635 * mess with it.
1529 **/ 1636 **/
1530 1637
1531void cpuset_exit(struct task_struct *tsk) 1638void cpuset_exit(struct task_struct *tsk)
1532{ 1639{
1533 struct cpuset *cs; 1640 struct cpuset *cs;
1534 1641
1535 task_lock(tsk); 1642 BUG_ON(!(tsk->flags & PF_EXITING));
1643
1536 cs = tsk->cpuset; 1644 cs = tsk->cpuset;
1537 tsk->cpuset = NULL; 1645 tsk->cpuset = NULL;
1538 task_unlock(tsk);
1539 1646
1540 if (notify_on_release(cs)) { 1647 if (notify_on_release(cs)) {
1541 char *pathbuf = NULL; 1648 char *pathbuf = NULL;
1542 1649
1543 cpuset_down(&cpuset_sem); 1650 down(&manage_sem);
1544 if (atomic_dec_and_test(&cs->count)) 1651 if (atomic_dec_and_test(&cs->count))
1545 check_for_release(cs, &pathbuf); 1652 check_for_release(cs, &pathbuf);
1546 cpuset_up(&cpuset_sem); 1653 up(&manage_sem);
1547 cpuset_release_agent(pathbuf); 1654 cpuset_release_agent(pathbuf);
1548 } else { 1655 } else {
1549 atomic_dec(&cs->count); 1656 atomic_dec(&cs->count);
@@ -1564,11 +1671,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1564{ 1671{
1565 cpumask_t mask; 1672 cpumask_t mask;
1566 1673
1567 cpuset_down(&cpuset_sem); 1674 down(&callback_sem);
1568 task_lock((struct task_struct *)tsk); 1675 task_lock((struct task_struct *)tsk);
1569 guarantee_online_cpus(tsk->cpuset, &mask); 1676 guarantee_online_cpus(tsk->cpuset, &mask);
1570 task_unlock((struct task_struct *)tsk); 1677 task_unlock((struct task_struct *)tsk);
1571 cpuset_up(&cpuset_sem); 1678 up(&callback_sem);
1572 1679
1573 return mask; 1680 return mask;
1574} 1681}
@@ -1584,19 +1691,28 @@ void cpuset_init_current_mems_allowed(void)
1584 * If the current tasks cpusets mems_allowed changed behind our backs, 1691 * If the current tasks cpusets mems_allowed changed behind our backs,
1585 * update current->mems_allowed and mems_generation to the new value. 1692 * update current->mems_allowed and mems_generation to the new value.
1586 * Do not call this routine if in_interrupt(). 1693 * Do not call this routine if in_interrupt().
1694 *
1695 * Call without callback_sem or task_lock() held. May be called
1696 * with or without manage_sem held. Unless exiting, it will acquire
1697 * task_lock(). Also might acquire callback_sem during call to
1698 * refresh_mems().
1587 */ 1699 */
1588 1700
1589void cpuset_update_current_mems_allowed(void) 1701void cpuset_update_current_mems_allowed(void)
1590{ 1702{
1591 struct cpuset *cs = current->cpuset; 1703 struct cpuset *cs;
1704 int need_to_refresh = 0;
1592 1705
1706 task_lock(current);
1707 cs = current->cpuset;
1593 if (!cs) 1708 if (!cs)
1594 return; /* task is exiting */ 1709 goto done;
1595 if (current->cpuset_mems_generation != cs->mems_generation) { 1710 if (current->cpuset_mems_generation != cs->mems_generation)
1596 cpuset_down(&cpuset_sem); 1711 need_to_refresh = 1;
1712done:
1713 task_unlock(current);
1714 if (need_to_refresh)
1597 refresh_mems(); 1715 refresh_mems();
1598 cpuset_up(&cpuset_sem);
1599 }
1600} 1716}
1601 1717
1602/** 1718/**
@@ -1630,7 +1746,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1630 1746
1631/* 1747/*
1632 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive 1748 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1633 * ancestor to the specified cpuset. Call while holding cpuset_sem. 1749 * ancestor to the specified cpuset. Call holding callback_sem.
1634 * If no ancestor is mem_exclusive (an unusual configuration), then 1750 * If no ancestor is mem_exclusive (an unusual configuration), then
1635 * returns the root cpuset. 1751 * returns the root cpuset.
1636 */ 1752 */
@@ -1657,12 +1773,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1657 * GFP_KERNEL allocations are not so marked, so can escape to the 1773 * GFP_KERNEL allocations are not so marked, so can escape to the
1658 * nearest mem_exclusive ancestor cpuset. 1774 * nearest mem_exclusive ancestor cpuset.
1659 * 1775 *
1660 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() 1776 * Scanning up parent cpusets requires callback_sem. The __alloc_pages()
1661 * routine only calls here with __GFP_HARDWALL bit _not_ set if 1777 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1662 * it's a GFP_KERNEL allocation, and all nodes in the current tasks 1778 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
1663 * mems_allowed came up empty on the first pass over the zonelist. 1779 * mems_allowed came up empty on the first pass over the zonelist.
1664 * So only GFP_KERNEL allocations, if all nodes in the cpuset are 1780 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1665 * short of memory, might require taking the cpuset_sem semaphore. 1781 * short of memory, might require taking the callback_sem semaphore.
1666 * 1782 *
1667 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() 1783 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1668 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing 1784 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -1679,7 +1795,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1679 * GFP_USER - only nodes in current tasks mems allowed ok. 1795 * GFP_USER - only nodes in current tasks mems allowed ok.
1680 **/ 1796 **/
1681 1797
1682int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) 1798int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1683{ 1799{
1684 int node; /* node that zone z is on */ 1800 int node; /* node that zone z is on */
1685 const struct cpuset *cs; /* current cpuset ancestors */ 1801 const struct cpuset *cs; /* current cpuset ancestors */
@@ -1693,15 +1809,18 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
1693 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ 1809 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
1694 return 0; 1810 return 0;
1695 1811
1812 if (current->flags & PF_EXITING) /* Let dying task have memory */
1813 return 1;
1814
1696 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 1815 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1697 cpuset_down(&cpuset_sem); 1816 down(&callback_sem);
1698 cs = current->cpuset; 1817
1699 if (!cs) 1818 task_lock(current);
1700 goto done; /* current task exiting */ 1819 cs = nearest_exclusive_ancestor(current->cpuset);
1701 cs = nearest_exclusive_ancestor(cs); 1820 task_unlock(current);
1821
1702 allowed = node_isset(node, cs->mems_allowed); 1822 allowed = node_isset(node, cs->mems_allowed);
1703done: 1823 up(&callback_sem);
1704 cpuset_up(&cpuset_sem);
1705 return allowed; 1824 return allowed;
1706} 1825}
1707 1826
@@ -1714,7 +1833,7 @@ done:
1714 * determine if task @p's memory usage might impact the memory 1833 * determine if task @p's memory usage might impact the memory
1715 * available to the current task. 1834 * available to the current task.
1716 * 1835 *
1717 * Acquires cpuset_sem - not suitable for calling from a fast path. 1836 * Acquires callback_sem - not suitable for calling from a fast path.
1718 **/ 1837 **/
1719 1838
1720int cpuset_excl_nodes_overlap(const struct task_struct *p) 1839int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1722,18 +1841,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1722 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 1841 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1723 int overlap = 0; /* do cpusets overlap? */ 1842 int overlap = 0; /* do cpusets overlap? */
1724 1843
1725 cpuset_down(&cpuset_sem); 1844 down(&callback_sem);
1726 cs1 = current->cpuset; 1845
1727 if (!cs1) 1846 task_lock(current);
1728 goto done; /* current task exiting */ 1847 if (current->flags & PF_EXITING) {
1729 cs2 = p->cpuset; 1848 task_unlock(current);
1730 if (!cs2) 1849 goto done;
1731 goto done; /* task p is exiting */ 1850 }
1732 cs1 = nearest_exclusive_ancestor(cs1); 1851 cs1 = nearest_exclusive_ancestor(current->cpuset);
1733 cs2 = nearest_exclusive_ancestor(cs2); 1852 task_unlock(current);
1853
1854 task_lock((struct task_struct *)p);
1855 if (p->flags & PF_EXITING) {
1856 task_unlock((struct task_struct *)p);
1857 goto done;
1858 }
1859 cs2 = nearest_exclusive_ancestor(p->cpuset);
1860 task_unlock((struct task_struct *)p);
1861
1734 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 1862 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1735done: 1863done:
1736 cpuset_up(&cpuset_sem); 1864 up(&callback_sem);
1737 1865
1738 return overlap; 1866 return overlap;
1739} 1867}
@@ -1742,6 +1870,10 @@ done:
1742 * proc_cpuset_show() 1870 * proc_cpuset_show()
1743 * - Print tasks cpuset path into seq_file. 1871 * - Print tasks cpuset path into seq_file.
1744 * - Used for /proc/<pid>/cpuset. 1872 * - Used for /proc/<pid>/cpuset.
1873 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
1874 * doesn't really matter if tsk->cpuset changes after we read it,
1875 * and we take manage_sem, keeping attach_task() from changing it
1876 * anyway.
1745 */ 1877 */
1746 1878
1747static int proc_cpuset_show(struct seq_file *m, void *v) 1879static int proc_cpuset_show(struct seq_file *m, void *v)
@@ -1756,10 +1888,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1756 return -ENOMEM; 1888 return -ENOMEM;
1757 1889
1758 tsk = m->private; 1890 tsk = m->private;
1759 cpuset_down(&cpuset_sem); 1891 down(&manage_sem);
1760 task_lock(tsk);
1761 cs = tsk->cpuset; 1892 cs = tsk->cpuset;
1762 task_unlock(tsk);
1763 if (!cs) { 1893 if (!cs) {
1764 retval = -EINVAL; 1894 retval = -EINVAL;
1765 goto out; 1895 goto out;
@@ -1771,7 +1901,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1771 seq_puts(m, buf); 1901 seq_puts(m, buf);
1772 seq_putc(m, '\n'); 1902 seq_putc(m, '\n');
1773out: 1903out:
1774 cpuset_up(&cpuset_sem); 1904 up(&manage_sem);
1775 kfree(buf); 1905 kfree(buf);
1776 return retval; 1906 return retval;
1777} 1907}
diff --git a/kernel/exit.c b/kernel/exit.c
index ee6d8b8abef5..ee515683b92d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -28,6 +28,7 @@
28#include <linux/cpuset.h> 28#include <linux/cpuset.h>
29#include <linux/syscalls.h> 29#include <linux/syscalls.h>
30#include <linux/signal.h> 30#include <linux/signal.h>
31#include <linux/cn_proc.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/unistd.h> 34#include <asm/unistd.h>
@@ -547,7 +548,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
547 548
548 if (p->pdeath_signal) 549 if (p->pdeath_signal)
549 /* We already hold the tasklist_lock here. */ 550 /* We already hold the tasklist_lock here. */
550 group_send_sig_info(p->pdeath_signal, (void *) 0, p); 551 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
551 552
552 /* Move the child from its dying parent to the new one. */ 553 /* Move the child from its dying parent to the new one. */
553 if (unlikely(traced)) { 554 if (unlikely(traced)) {
@@ -591,8 +592,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
591 int pgrp = process_group(p); 592 int pgrp = process_group(p);
592 593
593 if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { 594 if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
594 __kill_pg_info(SIGHUP, (void *)1, pgrp); 595 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
595 __kill_pg_info(SIGCONT, (void *)1, pgrp); 596 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
596 } 597 }
597 } 598 }
598} 599}
@@ -727,8 +728,8 @@ static void exit_notify(struct task_struct *tsk)
727 (t->signal->session == tsk->signal->session) && 728 (t->signal->session == tsk->signal->session) &&
728 will_become_orphaned_pgrp(process_group(tsk), tsk) && 729 will_become_orphaned_pgrp(process_group(tsk), tsk) &&
729 has_stopped_jobs(process_group(tsk))) { 730 has_stopped_jobs(process_group(tsk))) {
730 __kill_pg_info(SIGHUP, (void *)1, process_group(tsk)); 731 __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
731 __kill_pg_info(SIGCONT, (void *)1, process_group(tsk)); 732 __kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk));
732 } 733 }
733 734
734 /* Let father know we died 735 /* Let father know we died
@@ -783,10 +784,6 @@ static void exit_notify(struct task_struct *tsk)
783 /* If the process is dead, release it - nobody will wait for it */ 784 /* If the process is dead, release it - nobody will wait for it */
784 if (state == EXIT_DEAD) 785 if (state == EXIT_DEAD)
785 release_task(tsk); 786 release_task(tsk);
786
787 /* PF_DEAD causes final put_task_struct after we schedule. */
788 preempt_disable();
789 tsk->flags |= PF_DEAD;
790} 787}
791 788
792fastcall NORET_TYPE void do_exit(long code) 789fastcall NORET_TYPE void do_exit(long code)
@@ -839,10 +836,14 @@ fastcall NORET_TYPE void do_exit(long code)
839 preempt_count()); 836 preempt_count());
840 837
841 acct_update_integrals(tsk); 838 acct_update_integrals(tsk);
842 update_mem_hiwater(tsk); 839 if (tsk->mm) {
840 update_hiwater_rss(tsk->mm);
841 update_hiwater_vm(tsk->mm);
842 }
843 group_dead = atomic_dec_and_test(&tsk->signal->live); 843 group_dead = atomic_dec_and_test(&tsk->signal->live);
844 if (group_dead) { 844 if (group_dead) {
845 del_timer_sync(&tsk->signal->real_timer); 845 del_timer_sync(&tsk->signal->real_timer);
846 exit_itimers(tsk->signal);
846 acct_process(code); 847 acct_process(code);
847 } 848 }
848 exit_mm(tsk); 849 exit_mm(tsk);
@@ -858,18 +859,23 @@ fastcall NORET_TYPE void do_exit(long code)
858 if (group_dead && tsk->signal->leader) 859 if (group_dead && tsk->signal->leader)
859 disassociate_ctty(1); 860 disassociate_ctty(1);
860 861
861 module_put(tsk->thread_info->exec_domain->module); 862 module_put(task_thread_info(tsk)->exec_domain->module);
862 if (tsk->binfmt) 863 if (tsk->binfmt)
863 module_put(tsk->binfmt->module); 864 module_put(tsk->binfmt->module);
864 865
865 tsk->exit_code = code; 866 tsk->exit_code = code;
867 proc_exit_connector(tsk);
866 exit_notify(tsk); 868 exit_notify(tsk);
867#ifdef CONFIG_NUMA 869#ifdef CONFIG_NUMA
868 mpol_free(tsk->mempolicy); 870 mpol_free(tsk->mempolicy);
869 tsk->mempolicy = NULL; 871 tsk->mempolicy = NULL;
870#endif 872#endif
871 873
872 BUG_ON(!(current->flags & PF_DEAD)); 874 /* PF_DEAD causes final put_task_struct after we schedule. */
875 preempt_disable();
876 BUG_ON(tsk->flags & PF_DEAD);
877 tsk->flags |= PF_DEAD;
878
873 schedule(); 879 schedule();
874 BUG(); 880 BUG();
875 /* Avoid "noreturn function does return". */ 881 /* Avoid "noreturn function does return". */
@@ -1203,7 +1209,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
1203 1209
1204 exit_code = p->exit_code; 1210 exit_code = p->exit_code;
1205 if (unlikely(!exit_code) || 1211 if (unlikely(!exit_code) ||
1206 unlikely(p->state > TASK_STOPPED)) 1212 unlikely(p->state & TASK_TRACED))
1207 goto bail_ref; 1213 goto bail_ref;
1208 return wait_noreap_copyout(p, pid, uid, 1214 return wait_noreap_copyout(p, pid, uid,
1209 why, (exit_code << 8) | 0x7f, 1215 why, (exit_code << 8) | 0x7f,
@@ -1379,6 +1385,15 @@ repeat:
1379 1385
1380 switch (p->state) { 1386 switch (p->state) {
1381 case TASK_TRACED: 1387 case TASK_TRACED:
1388 /*
1389 * When we hit the race with PTRACE_ATTACH,
1390 * we will not report this child. But the
1391 * race means it has not yet been moved to
1392 * our ptrace_children list, so we need to
1393 * set the flag here to avoid a spurious ECHILD
1394 * when the race happens with the only child.
1395 */
1396 flag = 1;
1382 if (!my_ptrace_child(p)) 1397 if (!my_ptrace_child(p))
1383 continue; 1398 continue;
1384 /*FALLTHROUGH*/ 1399 /*FALLTHROUGH*/
diff --git a/kernel/fork.c b/kernel/fork.c
index 533ce27f4b2c..fb8572a42297 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
42#include <linux/profile.h> 42#include <linux/profile.h>
43#include <linux/rmap.h> 43#include <linux/rmap.h>
44#include <linux/acct.h> 44#include <linux/acct.h>
45#include <linux/cn_proc.h>
45 46
46#include <asm/pgtable.h> 47#include <asm/pgtable.h>
47#include <asm/pgalloc.h> 48#include <asm/pgalloc.h>
@@ -170,10 +171,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
170 return NULL; 171 return NULL;
171 } 172 }
172 173
173 *ti = *orig->thread_info;
174 *tsk = *orig; 174 *tsk = *orig;
175 tsk->thread_info = ti; 175 tsk->thread_info = ti;
176 ti->task = tsk; 176 setup_thread_stack(tsk, orig);
177 177
178 /* One for us, one for whoever does the "release_task()" (usually parent) */ 178 /* One for us, one for whoever does the "release_task()" (usually parent) */
179 atomic_set(&tsk->usage,2); 179 atomic_set(&tsk->usage,2);
@@ -182,37 +182,37 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
182} 182}
183 183
184#ifdef CONFIG_MMU 184#ifdef CONFIG_MMU
185static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) 185static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
186{ 186{
187 struct vm_area_struct * mpnt, *tmp, **pprev; 187 struct vm_area_struct *mpnt, *tmp, **pprev;
188 struct rb_node **rb_link, *rb_parent; 188 struct rb_node **rb_link, *rb_parent;
189 int retval; 189 int retval;
190 unsigned long charge; 190 unsigned long charge;
191 struct mempolicy *pol; 191 struct mempolicy *pol;
192 192
193 down_write(&oldmm->mmap_sem); 193 down_write(&oldmm->mmap_sem);
194 flush_cache_mm(current->mm); 194 flush_cache_mm(oldmm);
195 down_write(&mm->mmap_sem);
196
195 mm->locked_vm = 0; 197 mm->locked_vm = 0;
196 mm->mmap = NULL; 198 mm->mmap = NULL;
197 mm->mmap_cache = NULL; 199 mm->mmap_cache = NULL;
198 mm->free_area_cache = oldmm->mmap_base; 200 mm->free_area_cache = oldmm->mmap_base;
199 mm->cached_hole_size = ~0UL; 201 mm->cached_hole_size = ~0UL;
200 mm->map_count = 0; 202 mm->map_count = 0;
201 set_mm_counter(mm, rss, 0);
202 set_mm_counter(mm, anon_rss, 0);
203 cpus_clear(mm->cpu_vm_mask); 203 cpus_clear(mm->cpu_vm_mask);
204 mm->mm_rb = RB_ROOT; 204 mm->mm_rb = RB_ROOT;
205 rb_link = &mm->mm_rb.rb_node; 205 rb_link = &mm->mm_rb.rb_node;
206 rb_parent = NULL; 206 rb_parent = NULL;
207 pprev = &mm->mmap; 207 pprev = &mm->mmap;
208 208
209 for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { 209 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
210 struct file *file; 210 struct file *file;
211 211
212 if (mpnt->vm_flags & VM_DONTCOPY) { 212 if (mpnt->vm_flags & VM_DONTCOPY) {
213 long pages = vma_pages(mpnt); 213 long pages = vma_pages(mpnt);
214 mm->total_vm -= pages; 214 mm->total_vm -= pages;
215 __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, 215 vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
216 -pages); 216 -pages);
217 continue; 217 continue;
218 } 218 }
@@ -253,12 +253,8 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
253 } 253 }
254 254
255 /* 255 /*
256 * Link in the new vma and copy the page table entries: 256 * Link in the new vma and copy the page table entries.
257 * link in first so that swapoff can see swap entries.
258 * Note that, exceptionally, here the vma is inserted
259 * without holding mm->mmap_sem.
260 */ 257 */
261 spin_lock(&mm->page_table_lock);
262 *pprev = tmp; 258 *pprev = tmp;
263 pprev = &tmp->vm_next; 259 pprev = &tmp->vm_next;
264 260
@@ -267,8 +263,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
267 rb_parent = &tmp->vm_rb; 263 rb_parent = &tmp->vm_rb;
268 264
269 mm->map_count++; 265 mm->map_count++;
270 retval = copy_page_range(mm, current->mm, tmp); 266 retval = copy_page_range(mm, oldmm, mpnt);
271 spin_unlock(&mm->page_table_lock);
272 267
273 if (tmp->vm_ops && tmp->vm_ops->open) 268 if (tmp->vm_ops && tmp->vm_ops->open)
274 tmp->vm_ops->open(tmp); 269 tmp->vm_ops->open(tmp);
@@ -277,9 +272,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
277 goto out; 272 goto out;
278 } 273 }
279 retval = 0; 274 retval = 0;
280
281out: 275out:
282 flush_tlb_mm(current->mm); 276 up_write(&mm->mmap_sem);
277 flush_tlb_mm(oldmm);
283 up_write(&oldmm->mmap_sem); 278 up_write(&oldmm->mmap_sem);
284 return retval; 279 return retval;
285fail_nomem_policy: 280fail_nomem_policy:
@@ -323,10 +318,11 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
323 INIT_LIST_HEAD(&mm->mmlist); 318 INIT_LIST_HEAD(&mm->mmlist);
324 mm->core_waiters = 0; 319 mm->core_waiters = 0;
325 mm->nr_ptes = 0; 320 mm->nr_ptes = 0;
321 set_mm_counter(mm, file_rss, 0);
322 set_mm_counter(mm, anon_rss, 0);
326 spin_lock_init(&mm->page_table_lock); 323 spin_lock_init(&mm->page_table_lock);
327 rwlock_init(&mm->ioctx_list_lock); 324 rwlock_init(&mm->ioctx_list_lock);
328 mm->ioctx_list = NULL; 325 mm->ioctx_list = NULL;
329 mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
330 mm->free_area_cache = TASK_UNMAPPED_BASE; 326 mm->free_area_cache = TASK_UNMAPPED_BASE;
331 mm->cached_hole_size = ~0UL; 327 mm->cached_hole_size = ~0UL;
332 328
@@ -472,13 +468,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
472 if (clone_flags & CLONE_VM) { 468 if (clone_flags & CLONE_VM) {
473 atomic_inc(&oldmm->mm_users); 469 atomic_inc(&oldmm->mm_users);
474 mm = oldmm; 470 mm = oldmm;
475 /*
476 * There are cases where the PTL is held to ensure no
477 * new threads start up in user mode using an mm, which
478 * allows optimizing out ipis; the tlb_gather_mmu code
479 * is an example.
480 */
481 spin_unlock_wait(&oldmm->page_table_lock);
482 goto good_mm; 471 goto good_mm;
483 } 472 }
484 473
@@ -499,7 +488,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
499 if (retval) 488 if (retval)
500 goto free_pt; 489 goto free_pt;
501 490
502 mm->hiwater_rss = get_mm_counter(mm,rss); 491 mm->hiwater_rss = get_mm_rss(mm);
503 mm->hiwater_vm = mm->total_vm; 492 mm->hiwater_vm = mm->total_vm;
504 493
505good_mm: 494good_mm:
@@ -848,7 +837,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
848{ 837{
849 unsigned long new_flags = p->flags; 838 unsigned long new_flags = p->flags;
850 839
851 new_flags &= ~PF_SUPERPRIV; 840 new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
852 new_flags |= PF_FORKNOEXEC; 841 new_flags |= PF_FORKNOEXEC;
853 if (!(clone_flags & CLONE_PTRACE)) 842 if (!(clone_flags & CLONE_PTRACE))
854 p->ptrace = 0; 843 p->ptrace = 0;
@@ -928,7 +917,7 @@ static task_t *copy_process(unsigned long clone_flags,
928 if (nr_threads >= max_threads) 917 if (nr_threads >= max_threads)
929 goto bad_fork_cleanup_count; 918 goto bad_fork_cleanup_count;
930 919
931 if (!try_module_get(p->thread_info->exec_domain->module)) 920 if (!try_module_get(task_thread_info(p)->exec_domain->module))
932 goto bad_fork_cleanup_count; 921 goto bad_fork_cleanup_count;
933 922
934 if (p->binfmt && !try_module_get(p->binfmt->module)) 923 if (p->binfmt && !try_module_get(p->binfmt->module))
@@ -1135,8 +1124,6 @@ static task_t *copy_process(unsigned long clone_flags,
1135 if (unlikely(p->ptrace & PT_PTRACED)) 1124 if (unlikely(p->ptrace & PT_PTRACED))
1136 __ptrace_link(p, current->parent); 1125 __ptrace_link(p, current->parent);
1137 1126
1138 cpuset_fork(p);
1139
1140 attach_pid(p, PIDTYPE_PID, p->pid); 1127 attach_pid(p, PIDTYPE_PID, p->pid);
1141 attach_pid(p, PIDTYPE_TGID, p->tgid); 1128 attach_pid(p, PIDTYPE_TGID, p->tgid);
1142 if (thread_group_leader(p)) { 1129 if (thread_group_leader(p)) {
@@ -1152,6 +1139,8 @@ static task_t *copy_process(unsigned long clone_flags,
1152 nr_threads++; 1139 nr_threads++;
1153 total_forks++; 1140 total_forks++;
1154 write_unlock_irq(&tasklist_lock); 1141 write_unlock_irq(&tasklist_lock);
1142 proc_fork_connector(p);
1143 cpuset_fork(p);
1155 retval = 0; 1144 retval = 0;
1156 1145
1157fork_out: 1146fork_out:
@@ -1188,7 +1177,7 @@ bad_fork_cleanup:
1188 if (p->binfmt) 1177 if (p->binfmt)
1189 module_put(p->binfmt->module); 1178 module_put(p->binfmt->module);
1190bad_fork_cleanup_put_domain: 1179bad_fork_cleanup_put_domain:
1191 module_put(p->thread_info->exec_domain->module); 1180 module_put(task_thread_info(p)->exec_domain->module);
1192bad_fork_cleanup_count: 1181bad_fork_cleanup_count:
1193 put_group_info(p->group_info); 1182 put_group_info(p->group_info);
1194 atomic_dec(&p->user->processes); 1183 atomic_dec(&p->user->processes);
diff --git a/kernel/futex.c b/kernel/futex.c
index ca05fe6a70b2..5872e3507f35 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -201,23 +201,6 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
201 * from swap. But that's a lot of code to duplicate here 201 * from swap. But that's a lot of code to duplicate here
202 * for a rare case, so we simply fetch the page. 202 * for a rare case, so we simply fetch the page.
203 */ 203 */
204
205 /*
206 * Do a quick atomic lookup first - this is the fastpath.
207 */
208 spin_lock(&current->mm->page_table_lock);
209 page = follow_page(mm, uaddr, 0);
210 if (likely(page != NULL)) {
211 key->shared.pgoff =
212 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
213 spin_unlock(&current->mm->page_table_lock);
214 return 0;
215 }
216 spin_unlock(&current->mm->page_table_lock);
217
218 /*
219 * Do it the general way.
220 */
221 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); 204 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
222 if (err >= 0) { 205 if (err >= 0) {
223 key->shared.pgoff = 206 key->shared.pgoff =
@@ -367,6 +350,11 @@ retry:
367 if (bh1 != bh2) 350 if (bh1 != bh2)
368 spin_unlock(&bh2->lock); 351 spin_unlock(&bh2->lock);
369 352
353 if (unlikely(op_ret != -EFAULT)) {
354 ret = op_ret;
355 goto out;
356 }
357
370 /* futex_atomic_op_inuser needs to both read and write 358 /* futex_atomic_op_inuser needs to both read and write
371 * *(int __user *)uaddr2, but we can't modify it 359 * *(int __user *)uaddr2, but we can't modify it
372 * non-atomically. Therefore, if get_user below is not 360 * non-atomically. Therefore, if get_user below is not
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3ff7b925c387..51df337b37db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -117,14 +117,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
117 /* 117 /*
118 * No locking required for CPU-local interrupts: 118 * No locking required for CPU-local interrupts:
119 */ 119 */
120 desc->handler->ack(irq); 120 if (desc->handler->ack)
121 desc->handler->ack(irq);
121 action_ret = handle_IRQ_event(irq, regs, desc->action); 122 action_ret = handle_IRQ_event(irq, regs, desc->action);
122 desc->handler->end(irq); 123 desc->handler->end(irq);
123 return 1; 124 return 1;
124 } 125 }
125 126
126 spin_lock(&desc->lock); 127 spin_lock(&desc->lock);
127 desc->handler->ack(irq); 128 if (desc->handler->ack)
129 desc->handler->ack(irq);
128 /* 130 /*
129 * REPLAY is when Linux resends an IRQ that was dropped earlier 131 * REPLAY is when Linux resends an IRQ that was dropped earlier
130 * WAITING is used by probe to mark irqs that are being tested 132 * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1cfdb08ddf20..81c49a4d679e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -24,6 +24,7 @@ cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
24 24
25/** 25/**
26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
27 * @irq: interrupt number to wait for
27 * 28 *
28 * This function waits for any pending IRQ handlers for this interrupt 29 * This function waits for any pending IRQ handlers for this interrupt
29 * to complete before returning. If you use this function while 30 * to complete before returning. If you use this function while
@@ -35,6 +36,9 @@ void synchronize_irq(unsigned int irq)
35{ 36{
36 struct irq_desc *desc = irq_desc + irq; 37 struct irq_desc *desc = irq_desc + irq;
37 38
39 if (irq >= NR_IRQS)
40 return;
41
38 while (desc->status & IRQ_INPROGRESS) 42 while (desc->status & IRQ_INPROGRESS)
39 cpu_relax(); 43 cpu_relax();
40} 44}
@@ -59,6 +63,9 @@ void disable_irq_nosync(unsigned int irq)
59 irq_desc_t *desc = irq_desc + irq; 63 irq_desc_t *desc = irq_desc + irq;
60 unsigned long flags; 64 unsigned long flags;
61 65
66 if (irq >= NR_IRQS)
67 return;
68
62 spin_lock_irqsave(&desc->lock, flags); 69 spin_lock_irqsave(&desc->lock, flags);
63 if (!desc->depth++) { 70 if (!desc->depth++) {
64 desc->status |= IRQ_DISABLED; 71 desc->status |= IRQ_DISABLED;
@@ -85,6 +92,9 @@ void disable_irq(unsigned int irq)
85{ 92{
86 irq_desc_t *desc = irq_desc + irq; 93 irq_desc_t *desc = irq_desc + irq;
87 94
95 if (irq >= NR_IRQS)
96 return;
97
88 disable_irq_nosync(irq); 98 disable_irq_nosync(irq);
89 if (desc->action) 99 if (desc->action)
90 synchronize_irq(irq); 100 synchronize_irq(irq);
@@ -107,6 +117,9 @@ void enable_irq(unsigned int irq)
107 irq_desc_t *desc = irq_desc + irq; 117 irq_desc_t *desc = irq_desc + irq;
108 unsigned long flags; 118 unsigned long flags;
109 119
120 if (irq >= NR_IRQS)
121 return;
122
110 spin_lock_irqsave(&desc->lock, flags); 123 spin_lock_irqsave(&desc->lock, flags);
111 switch (desc->depth) { 124 switch (desc->depth) {
112 case 0: 125 case 0:
@@ -162,6 +175,9 @@ int setup_irq(unsigned int irq, struct irqaction * new)
162 unsigned long flags; 175 unsigned long flags;
163 int shared = 0; 176 int shared = 0;
164 177
178 if (irq >= NR_IRQS)
179 return -EINVAL;
180
165 if (desc->handler == &no_irq_type) 181 if (desc->handler == &no_irq_type)
166 return -ENOSYS; 182 return -ENOSYS;
167 /* 183 /*
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 13bcec151b57..39277dd6bf90 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -18,6 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/err.h> 19#include <linux/err.h>
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */
21#include <linux/mm.h> 22#include <linux/mm.h>
22 23
23#include <asm/sections.h> 24#include <asm/sections.h>
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cdd4dcd8fb63..2c95848fbce8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p)
90static int kimage_is_destination_range(struct kimage *image, 90static int kimage_is_destination_range(struct kimage *image,
91 unsigned long start, unsigned long end); 91 unsigned long start, unsigned long end);
92static struct page *kimage_alloc_page(struct kimage *image, 92static struct page *kimage_alloc_page(struct kimage *image,
93 unsigned int gfp_mask, 93 gfp_t gfp_mask,
94 unsigned long dest); 94 unsigned long dest);
95 95
96static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 96static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
@@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image,
326 return 0; 326 return 0;
327} 327}
328 328
329static struct page *kimage_alloc_pages(unsigned int gfp_mask, 329static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
330 unsigned int order)
331{ 330{
332 struct page *pages; 331 struct page *pages;
333 332
@@ -335,7 +334,7 @@ static struct page *kimage_alloc_pages(unsigned int gfp_mask,
335 if (pages) { 334 if (pages) {
336 unsigned int count, i; 335 unsigned int count, i;
337 pages->mapping = NULL; 336 pages->mapping = NULL;
338 pages->private = order; 337 set_page_private(pages, order);
339 count = 1 << order; 338 count = 1 << order;
340 for (i = 0; i < count; i++) 339 for (i = 0; i < count; i++)
341 SetPageReserved(pages + i); 340 SetPageReserved(pages + i);
@@ -348,7 +347,7 @@ static void kimage_free_pages(struct page *page)
348{ 347{
349 unsigned int order, count, i; 348 unsigned int order, count, i;
350 349
351 order = page->private; 350 order = page_private(page);
352 count = 1 << order; 351 count = 1 << order;
353 for (i = 0; i < count; i++) 352 for (i = 0; i < count; i++)
354 ClearPageReserved(page + i); 353 ClearPageReserved(page + i);
@@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image,
654} 653}
655 654
656static struct page *kimage_alloc_page(struct kimage *image, 655static struct page *kimage_alloc_page(struct kimage *image,
657 unsigned int gfp_mask, 656 gfp_t gfp_mask,
658 unsigned long destination) 657 unsigned long destination)
659{ 658{
660 /* 659 /*
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 179baafcdd96..64ab045c3d9d 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -36,7 +36,7 @@
36 * struct kfifo with kfree(). 36 * struct kfifo with kfree().
37 */ 37 */
38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 38struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
39 unsigned int __nocast gfp_mask, spinlock_t *lock) 39 gfp_t gfp_mask, spinlock_t *lock)
40{ 40{
41 struct kfifo *fifo; 41 struct kfifo *fifo;
42 42
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init);
64 * 64 *
65 * The size will be rounded-up to a power of 2. 65 * The size will be rounded-up to a power of 2.
66 */ 66 */
67struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock) 67struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
68{ 68{
69 unsigned char *buffer; 69 unsigned char *buffer;
70 struct kfifo *ret; 70 struct kfifo *ret;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 44166e3bb8af..51a892063aaa 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -131,14 +131,14 @@ struct subprocess_info {
131static int ____call_usermodehelper(void *data) 131static int ____call_usermodehelper(void *data)
132{ 132{
133 struct subprocess_info *sub_info = data; 133 struct subprocess_info *sub_info = data;
134 struct key *old_session; 134 struct key *new_session, *old_session;
135 int retval; 135 int retval;
136 136
137 /* Unblock all signals and set the session keyring. */ 137 /* Unblock all signals and set the session keyring. */
138 key_get(sub_info->ring); 138 new_session = key_get(sub_info->ring);
139 flush_signals(current); 139 flush_signals(current);
140 spin_lock_irq(&current->sighand->siglock); 140 spin_lock_irq(&current->sighand->siglock);
141 old_session = __install_session_keyring(current, sub_info->ring); 141 old_session = __install_session_keyring(current, new_session);
142 flush_signal_handlers(current, 1); 142 flush_signal_handlers(current, 1);
143 sigemptyset(&current->blocked); 143 sigemptyset(&current->blocked);
144 recalc_sigpending(); 144 recalc_sigpending();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f3ea492ab44d..5beda378cc75 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -32,9 +32,9 @@
32 * <prasanna@in.ibm.com> added function-return probes. 32 * <prasanna@in.ibm.com> added function-return probes.
33 */ 33 */
34#include <linux/kprobes.h> 34#include <linux/kprobes.h>
35#include <linux/spinlock.h>
36#include <linux/hash.h> 35#include <linux/hash.h>
37#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h>
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <asm-generic/sections.h> 40#include <asm-generic/sections.h>
@@ -48,9 +48,9 @@
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50 50
51unsigned int kprobe_cpu = NR_CPUS; 51static DEFINE_SPINLOCK(kprobe_lock); /* Protects kprobe_table */
52static DEFINE_SPINLOCK(kprobe_lock); 52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static struct kprobe *curr_kprobe; 53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 54
55/* 55/*
56 * kprobe->ainsn.insn points to the copy of the instruction to be 56 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -152,50 +152,31 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
152 } 152 }
153} 153}
154 154
155/* Locks kprobe: irqs must be disabled */ 155/* We have preemption disabled.. so it is safe to use __ versions */
156void __kprobes lock_kprobes(void) 156static inline void set_kprobe_instance(struct kprobe *kp)
157{ 157{
158 unsigned long flags = 0; 158 __get_cpu_var(kprobe_instance) = kp;
159
160 /* Avoiding local interrupts to happen right after we take the kprobe_lock
161 * and before we get a chance to update kprobe_cpu, this to prevent
162 * deadlock when we have a kprobe on ISR routine and a kprobe on task
163 * routine
164 */
165 local_irq_save(flags);
166
167 spin_lock(&kprobe_lock);
168 kprobe_cpu = smp_processor_id();
169
170 local_irq_restore(flags);
171} 159}
172 160
173void __kprobes unlock_kprobes(void) 161static inline void reset_kprobe_instance(void)
174{ 162{
175 unsigned long flags = 0; 163 __get_cpu_var(kprobe_instance) = NULL;
176
177 /* Avoiding local interrupts to happen right after we update
178 * kprobe_cpu and before we get a a chance to release kprobe_lock,
179 * this to prevent deadlock when we have a kprobe on ISR routine and
180 * a kprobe on task routine
181 */
182 local_irq_save(flags);
183
184 kprobe_cpu = NR_CPUS;
185 spin_unlock(&kprobe_lock);
186
187 local_irq_restore(flags);
188} 164}
189 165
190/* You have to be holding the kprobe_lock */ 166/*
167 * This routine is called either:
168 * - under the kprobe_lock spinlock - during kprobe_[un]register()
169 * OR
170 * - with preemption disabled - from arch/xxx/kernel/kprobes.c
171 */
191struct kprobe __kprobes *get_kprobe(void *addr) 172struct kprobe __kprobes *get_kprobe(void *addr)
192{ 173{
193 struct hlist_head *head; 174 struct hlist_head *head;
194 struct hlist_node *node; 175 struct hlist_node *node;
176 struct kprobe *p;
195 177
196 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; 178 head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
197 hlist_for_each(node, head) { 179 hlist_for_each_entry_rcu(p, node, head, hlist) {
198 struct kprobe *p = hlist_entry(node, struct kprobe, hlist);
199 if (p->addr == addr) 180 if (p->addr == addr)
200 return p; 181 return p;
201 } 182 }
@@ -210,13 +191,13 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
210{ 191{
211 struct kprobe *kp; 192 struct kprobe *kp;
212 193
213 list_for_each_entry(kp, &p->list, list) { 194 list_for_each_entry_rcu(kp, &p->list, list) {
214 if (kp->pre_handler) { 195 if (kp->pre_handler) {
215 curr_kprobe = kp; 196 set_kprobe_instance(kp);
216 if (kp->pre_handler(kp, regs)) 197 if (kp->pre_handler(kp, regs))
217 return 1; 198 return 1;
218 } 199 }
219 curr_kprobe = NULL; 200 reset_kprobe_instance();
220 } 201 }
221 return 0; 202 return 0;
222} 203}
@@ -226,11 +207,11 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
226{ 207{
227 struct kprobe *kp; 208 struct kprobe *kp;
228 209
229 list_for_each_entry(kp, &p->list, list) { 210 list_for_each_entry_rcu(kp, &p->list, list) {
230 if (kp->post_handler) { 211 if (kp->post_handler) {
231 curr_kprobe = kp; 212 set_kprobe_instance(kp);
232 kp->post_handler(kp, regs, flags); 213 kp->post_handler(kp, regs, flags);
233 curr_kprobe = NULL; 214 reset_kprobe_instance();
234 } 215 }
235 } 216 }
236 return; 217 return;
@@ -239,12 +220,14 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
239static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 220static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
240 int trapnr) 221 int trapnr)
241{ 222{
223 struct kprobe *cur = __get_cpu_var(kprobe_instance);
224
242 /* 225 /*
243 * if we faulted "during" the execution of a user specified 226 * if we faulted "during" the execution of a user specified
244 * probe handler, invoke just that probe's fault handler 227 * probe handler, invoke just that probe's fault handler
245 */ 228 */
246 if (curr_kprobe && curr_kprobe->fault_handler) { 229 if (cur && cur->fault_handler) {
247 if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr)) 230 if (cur->fault_handler(cur, regs, trapnr))
248 return 1; 231 return 1;
249 } 232 }
250 return 0; 233 return 0;
@@ -252,17 +235,18 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
252 235
253static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 236static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
254{ 237{
255 struct kprobe *kp = curr_kprobe; 238 struct kprobe *cur = __get_cpu_var(kprobe_instance);
256 if (curr_kprobe && kp->break_handler) { 239 int ret = 0;
257 if (kp->break_handler(kp, regs)) { 240
258 curr_kprobe = NULL; 241 if (cur && cur->break_handler) {
259 return 1; 242 if (cur->break_handler(cur, regs))
260 } 243 ret = 1;
261 } 244 }
262 curr_kprobe = NULL; 245 reset_kprobe_instance();
263 return 0; 246 return ret;
264} 247}
265 248
249/* Called with kretprobe_lock held */
266struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) 250struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
267{ 251{
268 struct hlist_node *node; 252 struct hlist_node *node;
@@ -272,6 +256,7 @@ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
272 return NULL; 256 return NULL;
273} 257}
274 258
259/* Called with kretprobe_lock held */
275static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe 260static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp) 261 *rp)
277{ 262{
@@ -282,6 +267,7 @@ static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
282 return NULL; 267 return NULL;
283} 268}
284 269
270/* Called with kretprobe_lock held */
285void __kprobes add_rp_inst(struct kretprobe_instance *ri) 271void __kprobes add_rp_inst(struct kretprobe_instance *ri)
286{ 272{
287 /* 273 /*
@@ -300,6 +286,7 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri)
300 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 286 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
301} 287}
302 288
289/* Called with kretprobe_lock held */
303void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) 290void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
304{ 291{
305 /* remove rp inst off the rprobe_inst_table */ 292 /* remove rp inst off the rprobe_inst_table */
@@ -333,13 +320,13 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
333 struct hlist_node *node, *tmp; 320 struct hlist_node *node, *tmp;
334 unsigned long flags = 0; 321 unsigned long flags = 0;
335 322
336 spin_lock_irqsave(&kprobe_lock, flags); 323 spin_lock_irqsave(&kretprobe_lock, flags);
337 head = kretprobe_inst_table_head(current); 324 head = kretprobe_inst_table_head(current);
338 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 325 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
339 if (ri->task == tk) 326 if (ri->task == tk)
340 recycle_rp_inst(ri); 327 recycle_rp_inst(ri);
341 } 328 }
342 spin_unlock_irqrestore(&kprobe_lock, flags); 329 spin_unlock_irqrestore(&kretprobe_lock, flags);
343} 330}
344 331
345/* 332/*
@@ -350,9 +337,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
350 struct pt_regs *regs) 337 struct pt_regs *regs)
351{ 338{
352 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 339 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
340 unsigned long flags = 0;
353 341
354 /*TODO: consider to only swap the RA after the last pre_handler fired */ 342 /*TODO: consider to only swap the RA after the last pre_handler fired */
343 spin_lock_irqsave(&kretprobe_lock, flags);
355 arch_prepare_kretprobe(rp, regs); 344 arch_prepare_kretprobe(rp, regs);
345 spin_unlock_irqrestore(&kretprobe_lock, flags);
356 return 0; 346 return 0;
357} 347}
358 348
@@ -383,13 +373,13 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
383 struct kprobe *kp; 373 struct kprobe *kp;
384 374
385 if (p->break_handler) { 375 if (p->break_handler) {
386 list_for_each_entry(kp, &old_p->list, list) { 376 list_for_each_entry_rcu(kp, &old_p->list, list) {
387 if (kp->break_handler) 377 if (kp->break_handler)
388 return -EEXIST; 378 return -EEXIST;
389 } 379 }
390 list_add_tail(&p->list, &old_p->list); 380 list_add_tail_rcu(&p->list, &old_p->list);
391 } else 381 } else
392 list_add(&p->list, &old_p->list); 382 list_add_rcu(&p->list, &old_p->list);
393 return 0; 383 return 0;
394} 384}
395 385
@@ -407,18 +397,18 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
407 ap->break_handler = aggr_break_handler; 397 ap->break_handler = aggr_break_handler;
408 398
409 INIT_LIST_HEAD(&ap->list); 399 INIT_LIST_HEAD(&ap->list);
410 list_add(&p->list, &ap->list); 400 list_add_rcu(&p->list, &ap->list);
411 401
412 INIT_HLIST_NODE(&ap->hlist); 402 INIT_HLIST_NODE(&ap->hlist);
413 hlist_del(&p->hlist); 403 hlist_del_rcu(&p->hlist);
414 hlist_add_head(&ap->hlist, 404 hlist_add_head_rcu(&ap->hlist,
415 &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); 405 &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
416} 406}
417 407
418/* 408/*
419 * This is the second or subsequent kprobe at the address - handle 409 * This is the second or subsequent kprobe at the address - handle
420 * the intricacies 410 * the intricacies
421 * TODO: Move kcalloc outside the spinlock 411 * TODO: Move kcalloc outside the spin_lock
422 */ 412 */
423static int __kprobes register_aggr_kprobe(struct kprobe *old_p, 413static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
424 struct kprobe *p) 414 struct kprobe *p)
@@ -444,7 +434,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
444static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags) 434static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
445{ 435{
446 arch_disarm_kprobe(p); 436 arch_disarm_kprobe(p);
447 hlist_del(&p->hlist); 437 hlist_del_rcu(&p->hlist);
448 spin_unlock_irqrestore(&kprobe_lock, flags); 438 spin_unlock_irqrestore(&kprobe_lock, flags);
449 arch_remove_kprobe(p); 439 arch_remove_kprobe(p);
450} 440}
@@ -452,11 +442,10 @@ static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
452static inline void cleanup_aggr_kprobe(struct kprobe *old_p, 442static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
453 struct kprobe *p, unsigned long flags) 443 struct kprobe *p, unsigned long flags)
454{ 444{
455 list_del(&p->list); 445 list_del_rcu(&p->list);
456 if (list_empty(&old_p->list)) { 446 if (list_empty(&old_p->list))
457 cleanup_kprobe(old_p, flags); 447 cleanup_kprobe(old_p, flags);
458 kfree(old_p); 448 else
459 } else
460 spin_unlock_irqrestore(&kprobe_lock, flags); 449 spin_unlock_irqrestore(&kprobe_lock, flags);
461} 450}
462 451
@@ -479,9 +468,9 @@ int __kprobes register_kprobe(struct kprobe *p)
479 if ((ret = arch_prepare_kprobe(p)) != 0) 468 if ((ret = arch_prepare_kprobe(p)) != 0)
480 goto rm_kprobe; 469 goto rm_kprobe;
481 470
471 p->nmissed = 0;
482 spin_lock_irqsave(&kprobe_lock, flags); 472 spin_lock_irqsave(&kprobe_lock, flags);
483 old_p = get_kprobe(p->addr); 473 old_p = get_kprobe(p->addr);
484 p->nmissed = 0;
485 if (old_p) { 474 if (old_p) {
486 ret = register_aggr_kprobe(old_p, p); 475 ret = register_aggr_kprobe(old_p, p);
487 goto out; 476 goto out;
@@ -489,7 +478,7 @@ int __kprobes register_kprobe(struct kprobe *p)
489 478
490 arch_copy_kprobe(p); 479 arch_copy_kprobe(p);
491 INIT_HLIST_NODE(&p->hlist); 480 INIT_HLIST_NODE(&p->hlist);
492 hlist_add_head(&p->hlist, 481 hlist_add_head_rcu(&p->hlist,
493 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 482 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
494 483
495 arch_arm_kprobe(p); 484 arch_arm_kprobe(p);
@@ -510,10 +499,16 @@ void __kprobes unregister_kprobe(struct kprobe *p)
510 spin_lock_irqsave(&kprobe_lock, flags); 499 spin_lock_irqsave(&kprobe_lock, flags);
511 old_p = get_kprobe(p->addr); 500 old_p = get_kprobe(p->addr);
512 if (old_p) { 501 if (old_p) {
502 /* cleanup_*_kprobe() does the spin_unlock_irqrestore */
513 if (old_p->pre_handler == aggr_pre_handler) 503 if (old_p->pre_handler == aggr_pre_handler)
514 cleanup_aggr_kprobe(old_p, p, flags); 504 cleanup_aggr_kprobe(old_p, p, flags);
515 else 505 else
516 cleanup_kprobe(p, flags); 506 cleanup_kprobe(p, flags);
507
508 synchronize_sched();
509 if (old_p->pre_handler == aggr_pre_handler &&
510 list_empty(&old_p->list))
511 kfree(old_p);
517 } else 512 } else
518 spin_unlock_irqrestore(&kprobe_lock, flags); 513 spin_unlock_irqrestore(&kprobe_lock, flags);
519} 514}
@@ -590,13 +585,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
590 585
591 unregister_kprobe(&rp->kp); 586 unregister_kprobe(&rp->kp);
592 /* No race here */ 587 /* No race here */
593 spin_lock_irqsave(&kprobe_lock, flags); 588 spin_lock_irqsave(&kretprobe_lock, flags);
594 free_rp_inst(rp); 589 free_rp_inst(rp);
595 while ((ri = get_used_rp_inst(rp)) != NULL) { 590 while ((ri = get_used_rp_inst(rp)) != NULL) {
596 ri->rp = NULL; 591 ri->rp = NULL;
597 hlist_del(&ri->uflist); 592 hlist_del(&ri->uflist);
598 } 593 }
599 spin_unlock_irqrestore(&kprobe_lock, flags); 594 spin_unlock_irqrestore(&kretprobe_lock, flags);
600} 595}
601 596
602static int __init init_kprobes(void) 597static int __init init_kprobes(void)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f50f174e92da..e75950a1092c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -165,6 +165,12 @@ EXPORT_SYMBOL(kthread_bind);
165 165
166int kthread_stop(struct task_struct *k) 166int kthread_stop(struct task_struct *k)
167{ 167{
168 return kthread_stop_sem(k, NULL);
169}
170EXPORT_SYMBOL(kthread_stop);
171
172int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
173{
168 int ret; 174 int ret;
169 175
170 down(&kthread_stop_lock); 176 down(&kthread_stop_lock);
@@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k)
178 184
179 /* Now set kthread_should_stop() to true, and wake it up. */ 185 /* Now set kthread_should_stop() to true, and wake it up. */
180 kthread_stop_info.k = k; 186 kthread_stop_info.k = k;
181 wake_up_process(k); 187 if (s)
188 up(s);
189 else
190 wake_up_process(k);
182 put_task_struct(k); 191 put_task_struct(k);
183 192
184 /* Once it dies, reset stop ptr, gather result and we're done. */ 193 /* Once it dies, reset stop ptr, gather result and we're done. */
@@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k)
189 198
190 return ret; 199 return ret;
191} 200}
192EXPORT_SYMBOL(kthread_stop); 201EXPORT_SYMBOL(kthread_stop_sem);
193 202
194static __init int helper_init(void) 203static __init int helper_init(void)
195{ 204{
diff --git a/kernel/module.c b/kernel/module.c
index ff5c500ab625..2ea929d51ad0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -37,6 +37,7 @@
37#include <linux/stop_machine.h> 37#include <linux/stop_machine.h>
38#include <linux/device.h> 38#include <linux/device.h>
39#include <linux/string.h> 39#include <linux/string.h>
40#include <linux/sched.h>
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/semaphore.h> 42#include <asm/semaphore.h>
42#include <asm/cacheflush.h> 43#include <asm/cacheflush.h>
diff --git a/kernel/params.c b/kernel/params.c
index fbf173215fd2..47ba69547945 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/device.h> 24#include <linux/device.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h>
26 27
27#if 0 28#if 0
28#define DEBUGP printk 29#define DEBUGP printk
@@ -80,8 +81,6 @@ static char *next_arg(char *args, char **param, char **val)
80 int in_quote = 0, quoted = 0; 81 int in_quote = 0, quoted = 0;
81 char *next; 82 char *next;
82 83
83 /* Chew any extra spaces */
84 while (*args == ' ') args++;
85 if (*args == '"') { 84 if (*args == '"') {
86 args++; 85 args++;
87 in_quote = 1; 86 in_quote = 1;
@@ -121,6 +120,10 @@ static char *next_arg(char *args, char **param, char **val)
121 next = args + i + 1; 120 next = args + i + 1;
122 } else 121 } else
123 next = args + i; 122 next = args + i;
123
124 /* Chew up trailing spaces. */
125 while (*next == ' ')
126 next++;
124 return next; 127 return next;
125} 128}
126 129
@@ -135,6 +138,10 @@ int parse_args(const char *name,
135 138
136 DEBUGP("Parsing ARGS: %s\n", args); 139 DEBUGP("Parsing ARGS: %s\n", args);
137 140
141 /* Chew leading spaces */
142 while (*args == ' ')
143 args++;
144
138 while (*args) { 145 while (*args) {
139 int ret; 146 int ret;
140 147
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ad85d3f0dcc4..cae4f5728997 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -36,7 +36,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
36 union cpu_time_count ret; 36 union cpu_time_count ret;
37 ret.sched = 0; /* high half always zero when .cpu used */ 37 ret.sched = 0; /* high half always zero when .cpu used */
38 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 38 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
39 ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; 39 ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
40 } else { 40 } else {
41 ret.cpu = timespec_to_cputime(tp); 41 ret.cpu = timespec_to_cputime(tp);
42 } 42 }
@@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
91 * Update expiry time from increment, and increase overrun count, 91 * Update expiry time from increment, and increase overrun count,
92 * given the current clock sample. 92 * given the current clock sample.
93 */ 93 */
94static inline void bump_cpu_timer(struct k_itimer *timer, 94static void bump_cpu_timer(struct k_itimer *timer,
95 union cpu_time_count now) 95 union cpu_time_count now)
96{ 96{
97 int i; 97 int i;
@@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
110 for (i = 0; incr < delta - incr; i++) 110 for (i = 0; incr < delta - incr; i++)
111 incr = incr << 1; 111 incr = incr << 1;
112 for (; i >= 0; incr >>= 1, i--) { 112 for (; i >= 0; incr >>= 1, i--) {
113 if (delta <= incr) 113 if (delta < incr)
114 continue; 114 continue;
115 timer->it.cpu.expires.sched += incr; 115 timer->it.cpu.expires.sched += incr;
116 timer->it_overrun += 1 << i; 116 timer->it_overrun += 1 << i;
@@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
128 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 128 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
129 incr = cputime_add(incr, incr); 129 incr = cputime_add(incr, incr);
130 for (; i >= 0; incr = cputime_halve(incr), i--) { 130 for (; i >= 0; incr = cputime_halve(incr), i--) {
131 if (cputime_le(delta, incr)) 131 if (cputime_lt(delta, incr))
132 continue; 132 continue;
133 timer->it.cpu.expires.cpu = 133 timer->it.cpu.expires.cpu =
134 cputime_add(timer->it.cpu.expires.cpu, incr); 134 cputime_add(timer->it.cpu.expires.cpu, incr);
@@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
380int posix_cpu_timer_del(struct k_itimer *timer) 380int posix_cpu_timer_del(struct k_itimer *timer)
381{ 381{
382 struct task_struct *p = timer->it.cpu.task; 382 struct task_struct *p = timer->it.cpu.task;
383 int ret = 0;
383 384
384 if (timer->it.cpu.firing) 385 if (likely(p != NULL)) {
385 return TIMER_RETRY;
386
387 if (unlikely(p == NULL))
388 return 0;
389
390 if (!list_empty(&timer->it.cpu.entry)) {
391 read_lock(&tasklist_lock); 386 read_lock(&tasklist_lock);
392 if (unlikely(p->signal == NULL)) { 387 if (unlikely(p->signal == NULL)) {
393 /* 388 /*
@@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer)
396 */ 391 */
397 BUG_ON(!list_empty(&timer->it.cpu.entry)); 392 BUG_ON(!list_empty(&timer->it.cpu.entry));
398 } else { 393 } else {
399 /*
400 * Take us off the task's timer list.
401 */
402 spin_lock(&p->sighand->siglock); 394 spin_lock(&p->sighand->siglock);
403 list_del(&timer->it.cpu.entry); 395 if (timer->it.cpu.firing)
396 ret = TIMER_RETRY;
397 else
398 list_del(&timer->it.cpu.entry);
404 spin_unlock(&p->sighand->siglock); 399 spin_unlock(&p->sighand->siglock);
405 } 400 }
406 read_unlock(&tasklist_lock); 401 read_unlock(&tasklist_lock);
402
403 if (!ret)
404 put_task_struct(p);
407 } 405 }
408 put_task_struct(p);
409 406
410 return 0; 407 return ret;
411} 408}
412 409
413/* 410/*
@@ -424,7 +421,6 @@ static void cleanup_timers(struct list_head *head,
424 cputime_t ptime = cputime_add(utime, stime); 421 cputime_t ptime = cputime_add(utime, stime);
425 422
426 list_for_each_entry_safe(timer, next, head, entry) { 423 list_for_each_entry_safe(timer, next, head, entry) {
427 timer->task = NULL;
428 list_del_init(&timer->entry); 424 list_del_init(&timer->entry);
429 if (cputime_lt(timer->expires.cpu, ptime)) { 425 if (cputime_lt(timer->expires.cpu, ptime)) {
430 timer->expires.cpu = cputime_zero; 426 timer->expires.cpu = cputime_zero;
@@ -436,7 +432,6 @@ static void cleanup_timers(struct list_head *head,
436 432
437 ++head; 433 ++head;
438 list_for_each_entry_safe(timer, next, head, entry) { 434 list_for_each_entry_safe(timer, next, head, entry) {
439 timer->task = NULL;
440 list_del_init(&timer->entry); 435 list_del_init(&timer->entry);
441 if (cputime_lt(timer->expires.cpu, utime)) { 436 if (cputime_lt(timer->expires.cpu, utime)) {
442 timer->expires.cpu = cputime_zero; 437 timer->expires.cpu = cputime_zero;
@@ -448,7 +443,6 @@ static void cleanup_timers(struct list_head *head,
448 443
449 ++head; 444 ++head;
450 list_for_each_entry_safe(timer, next, head, entry) { 445 list_for_each_entry_safe(timer, next, head, entry) {
451 timer->task = NULL;
452 list_del_init(&timer->entry); 446 list_del_init(&timer->entry);
453 if (timer->expires.sched < sched_time) { 447 if (timer->expires.sched < sched_time) {
454 timer->expires.sched = 0; 448 timer->expires.sched = 0;
@@ -492,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p,
492 struct task_struct *t = p; 486 struct task_struct *t = p;
493 unsigned int nthreads = atomic_read(&p->signal->live); 487 unsigned int nthreads = atomic_read(&p->signal->live);
494 488
489 if (!nthreads)
490 return;
491
495 switch (clock_idx) { 492 switch (clock_idx) {
496 default: 493 default:
497 BUG(); 494 BUG();
@@ -500,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p,
500 left = cputime_div(cputime_sub(expires.cpu, val.cpu), 497 left = cputime_div(cputime_sub(expires.cpu, val.cpu),
501 nthreads); 498 nthreads);
502 do { 499 do {
503 if (!unlikely(t->exit_state)) { 500 if (likely(!(t->flags & PF_EXITING))) {
504 ticks = cputime_add(prof_ticks(t), left); 501 ticks = cputime_add(prof_ticks(t), left);
505 if (cputime_eq(t->it_prof_expires, 502 if (cputime_eq(t->it_prof_expires,
506 cputime_zero) || 503 cputime_zero) ||
@@ -515,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p,
515 left = cputime_div(cputime_sub(expires.cpu, val.cpu), 512 left = cputime_div(cputime_sub(expires.cpu, val.cpu),
516 nthreads); 513 nthreads);
517 do { 514 do {
518 if (!unlikely(t->exit_state)) { 515 if (likely(!(t->flags & PF_EXITING))) {
519 ticks = cputime_add(virt_ticks(t), left); 516 ticks = cputime_add(virt_ticks(t), left);
520 if (cputime_eq(t->it_virt_expires, 517 if (cputime_eq(t->it_virt_expires,
521 cputime_zero) || 518 cputime_zero) ||
@@ -530,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p,
530 nsleft = expires.sched - val.sched; 527 nsleft = expires.sched - val.sched;
531 do_div(nsleft, nthreads); 528 do_div(nsleft, nthreads);
532 do { 529 do {
533 if (!unlikely(t->exit_state)) { 530 if (likely(!(t->flags & PF_EXITING))) {
534 ns = t->sched_time + nsleft; 531 ns = t->sched_time + nsleft;
535 if (t->it_sched_expires == 0 || 532 if (t->it_sched_expires == 0 ||
536 t->it_sched_expires > ns) { 533 t->it_sched_expires > ns) {
@@ -569,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
569 struct cpu_timer_list *next; 566 struct cpu_timer_list *next;
570 unsigned long i; 567 unsigned long i;
571 568
569 if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING))
570 return;
571
572 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 572 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
573 p->cpu_timers : p->signal->cpu_timers); 573 p->cpu_timers : p->signal->cpu_timers);
574 head += CPUCLOCK_WHICH(timer->it_clock); 574 head += CPUCLOCK_WHICH(timer->it_clock);
@@ -579,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
579 listpos = head; 579 listpos = head;
580 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 580 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
581 list_for_each_entry(next, head, entry) { 581 list_for_each_entry(next, head, entry) {
582 if (next->expires.sched > nt->expires.sched) { 582 if (next->expires.sched > nt->expires.sched)
583 listpos = &next->entry;
584 break; 583 break;
585 } 584 listpos = &next->entry;
586 } 585 }
587 } else { 586 } else {
588 list_for_each_entry(next, head, entry) { 587 list_for_each_entry(next, head, entry) {
589 if (cputime_gt(next->expires.cpu, nt->expires.cpu)) { 588 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
590 listpos = &next->entry;
591 break; 589 break;
592 } 590 listpos = &next->entry;
593 } 591 }
594 } 592 }
595 list_add(&nt->entry, listpos); 593 list_add(&nt->entry, listpos);
@@ -733,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
733 * Disarm any old timer after extracting its expiry time. 731 * Disarm any old timer after extracting its expiry time.
734 */ 732 */
735 BUG_ON(!irqs_disabled()); 733 BUG_ON(!irqs_disabled());
734
735 ret = 0;
736 spin_lock(&p->sighand->siglock); 736 spin_lock(&p->sighand->siglock);
737 old_expires = timer->it.cpu.expires; 737 old_expires = timer->it.cpu.expires;
738 list_del_init(&timer->it.cpu.entry); 738 if (unlikely(timer->it.cpu.firing)) {
739 timer->it.cpu.firing = -1;
740 ret = TIMER_RETRY;
741 } else
742 list_del_init(&timer->it.cpu.entry);
739 spin_unlock(&p->sighand->siglock); 743 spin_unlock(&p->sighand->siglock);
740 744
741 /* 745 /*
@@ -783,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
783 } 787 }
784 } 788 }
785 789
786 if (unlikely(timer->it.cpu.firing)) { 790 if (unlikely(ret)) {
787 /* 791 /*
788 * We are colliding with the timer actually firing. 792 * We are colliding with the timer actually firing.
789 * Punt after filling in the timer's old value, and 793 * Punt after filling in the timer's old value, and
@@ -791,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
791 * it as an overrun (thanks to bump_cpu_timer above). 795 * it as an overrun (thanks to bump_cpu_timer above).
792 */ 796 */
793 read_unlock(&tasklist_lock); 797 read_unlock(&tasklist_lock);
794 timer->it.cpu.firing = -1;
795 ret = TIMER_RETRY;
796 goto out; 798 goto out;
797 } 799 }
798 800
@@ -958,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
958static void check_thread_timers(struct task_struct *tsk, 960static void check_thread_timers(struct task_struct *tsk,
959 struct list_head *firing) 961 struct list_head *firing)
960{ 962{
963 int maxfire;
961 struct list_head *timers = tsk->cpu_timers; 964 struct list_head *timers = tsk->cpu_timers;
962 965
966 maxfire = 20;
963 tsk->it_prof_expires = cputime_zero; 967 tsk->it_prof_expires = cputime_zero;
964 while (!list_empty(timers)) { 968 while (!list_empty(timers)) {
965 struct cpu_timer_list *t = list_entry(timers->next, 969 struct cpu_timer_list *t = list_entry(timers->next,
966 struct cpu_timer_list, 970 struct cpu_timer_list,
967 entry); 971 entry);
968 if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 972 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
969 tsk->it_prof_expires = t->expires.cpu; 973 tsk->it_prof_expires = t->expires.cpu;
970 break; 974 break;
971 } 975 }
@@ -974,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk,
974 } 978 }
975 979
976 ++timers; 980 ++timers;
981 maxfire = 20;
977 tsk->it_virt_expires = cputime_zero; 982 tsk->it_virt_expires = cputime_zero;
978 while (!list_empty(timers)) { 983 while (!list_empty(timers)) {
979 struct cpu_timer_list *t = list_entry(timers->next, 984 struct cpu_timer_list *t = list_entry(timers->next,
980 struct cpu_timer_list, 985 struct cpu_timer_list,
981 entry); 986 entry);
982 if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 987 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
983 tsk->it_virt_expires = t->expires.cpu; 988 tsk->it_virt_expires = t->expires.cpu;
984 break; 989 break;
985 } 990 }
@@ -988,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk,
988 } 993 }
989 994
990 ++timers; 995 ++timers;
996 maxfire = 20;
991 tsk->it_sched_expires = 0; 997 tsk->it_sched_expires = 0;
992 while (!list_empty(timers)) { 998 while (!list_empty(timers)) {
993 struct cpu_timer_list *t = list_entry(timers->next, 999 struct cpu_timer_list *t = list_entry(timers->next,
994 struct cpu_timer_list, 1000 struct cpu_timer_list,
995 entry); 1001 entry);
996 if (tsk->sched_time < t->expires.sched) { 1002 if (!--maxfire || tsk->sched_time < t->expires.sched) {
997 tsk->it_sched_expires = t->expires.sched; 1003 tsk->it_sched_expires = t->expires.sched;
998 break; 1004 break;
999 } 1005 }
@@ -1010,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk,
1010static void check_process_timers(struct task_struct *tsk, 1016static void check_process_timers(struct task_struct *tsk,
1011 struct list_head *firing) 1017 struct list_head *firing)
1012{ 1018{
1019 int maxfire;
1013 struct signal_struct *const sig = tsk->signal; 1020 struct signal_struct *const sig = tsk->signal;
1014 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1021 cputime_t utime, stime, ptime, virt_expires, prof_expires;
1015 unsigned long long sched_time, sched_expires; 1022 unsigned long long sched_time, sched_expires;
@@ -1042,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk,
1042 } while (t != tsk); 1049 } while (t != tsk);
1043 ptime = cputime_add(utime, stime); 1050 ptime = cputime_add(utime, stime);
1044 1051
1052 maxfire = 20;
1045 prof_expires = cputime_zero; 1053 prof_expires = cputime_zero;
1046 while (!list_empty(timers)) { 1054 while (!list_empty(timers)) {
1047 struct cpu_timer_list *t = list_entry(timers->next, 1055 struct cpu_timer_list *t = list_entry(timers->next,
1048 struct cpu_timer_list, 1056 struct cpu_timer_list,
1049 entry); 1057 entry);
1050 if (cputime_lt(ptime, t->expires.cpu)) { 1058 if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
1051 prof_expires = t->expires.cpu; 1059 prof_expires = t->expires.cpu;
1052 break; 1060 break;
1053 } 1061 }
@@ -1056,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk,
1056 } 1064 }
1057 1065
1058 ++timers; 1066 ++timers;
1067 maxfire = 20;
1059 virt_expires = cputime_zero; 1068 virt_expires = cputime_zero;
1060 while (!list_empty(timers)) { 1069 while (!list_empty(timers)) {
1061 struct cpu_timer_list *t = list_entry(timers->next, 1070 struct cpu_timer_list *t = list_entry(timers->next,
1062 struct cpu_timer_list, 1071 struct cpu_timer_list,
1063 entry); 1072 entry);
1064 if (cputime_lt(utime, t->expires.cpu)) { 1073 if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
1065 virt_expires = t->expires.cpu; 1074 virt_expires = t->expires.cpu;
1066 break; 1075 break;
1067 } 1076 }
@@ -1070,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk,
1070 } 1079 }
1071 1080
1072 ++timers; 1081 ++timers;
1082 maxfire = 20;
1073 sched_expires = 0; 1083 sched_expires = 0;
1074 while (!list_empty(timers)) { 1084 while (!list_empty(timers)) {
1075 struct cpu_timer_list *t = list_entry(timers->next, 1085 struct cpu_timer_list *t = list_entry(timers->next,
1076 struct cpu_timer_list, 1086 struct cpu_timer_list,
1077 entry); 1087 entry);
1078 if (sched_time < t->expires.sched) { 1088 if (!--maxfire || sched_time < t->expires.sched) {
1079 sched_expires = t->expires.sched; 1089 sched_expires = t->expires.sched;
1080 break; 1090 break;
1081 } 1091 }
@@ -1158,6 +1168,9 @@ static void check_process_timers(struct task_struct *tsk,
1158 unsigned long long sched_left, sched; 1168 unsigned long long sched_left, sched;
1159 const unsigned int nthreads = atomic_read(&sig->live); 1169 const unsigned int nthreads = atomic_read(&sig->live);
1160 1170
1171 if (!nthreads)
1172 return;
1173
1161 prof_left = cputime_sub(prof_expires, utime); 1174 prof_left = cputime_sub(prof_expires, utime);
1162 prof_left = cputime_sub(prof_left, stime); 1175 prof_left = cputime_sub(prof_left, stime);
1163 prof_left = cputime_div(prof_left, nthreads); 1176 prof_left = cputime_div(prof_left, nthreads);
@@ -1194,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk,
1194 1207
1195 do { 1208 do {
1196 t = next_thread(t); 1209 t = next_thread(t);
1197 } while (unlikely(t->exit_state)); 1210 } while (unlikely(t->flags & PF_EXITING));
1198 } while (t != tsk); 1211 } while (t != tsk);
1199 } 1212 }
1200} 1213}
@@ -1212,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1212 /* 1225 /*
1213 * The task was cleaned up already, no future firings. 1226 * The task was cleaned up already, no future firings.
1214 */ 1227 */
1215 return; 1228 goto out;
1216 1229
1217 /* 1230 /*
1218 * Fetch the current sample and update the timer's expiry time. 1231 * Fetch the current sample and update the timer's expiry time.
@@ -1222,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1222 bump_cpu_timer(timer, now); 1235 bump_cpu_timer(timer, now);
1223 if (unlikely(p->exit_state)) { 1236 if (unlikely(p->exit_state)) {
1224 clear_dead_task(timer, now); 1237 clear_dead_task(timer, now);
1225 return; 1238 goto out;
1226 } 1239 }
1227 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1240 read_lock(&tasklist_lock); /* arm_timer needs it. */
1228 } else { 1241 } else {
@@ -1235,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1235 put_task_struct(p); 1248 put_task_struct(p);
1236 timer->it.cpu.task = p = NULL; 1249 timer->it.cpu.task = p = NULL;
1237 timer->it.cpu.expires.sched = 0; 1250 timer->it.cpu.expires.sched = 0;
1238 read_unlock(&tasklist_lock); 1251 goto out_unlock;
1239 return;
1240 } else if (unlikely(p->exit_state) && thread_group_empty(p)) { 1252 } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
1241 /* 1253 /*
1242 * We've noticed that the thread is dead, but 1254 * We've noticed that the thread is dead, but
@@ -1244,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1244 * drop our task ref. 1256 * drop our task ref.
1245 */ 1257 */
1246 clear_dead_task(timer, now); 1258 clear_dead_task(timer, now);
1247 read_unlock(&tasklist_lock); 1259 goto out_unlock;
1248 return;
1249 } 1260 }
1250 cpu_clock_sample_group(timer->it_clock, p, &now); 1261 cpu_clock_sample_group(timer->it_clock, p, &now);
1251 bump_cpu_timer(timer, now); 1262 bump_cpu_timer(timer, now);
@@ -1257,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1257 */ 1268 */
1258 arm_timer(timer, now); 1269 arm_timer(timer, now);
1259 1270
1271out_unlock:
1260 read_unlock(&tasklist_lock); 1272 read_unlock(&tasklist_lock);
1273
1274out:
1275 timer->it_overrun_last = timer->it_overrun;
1276 timer->it_overrun = -1;
1277 ++timer->it_requeue_pending;
1261} 1278}
1262 1279
1263/* 1280/*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index b7b532acd9fc..5870efb3e200 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -270,7 +270,7 @@ static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
270 long sec = tp->tv_sec; 270 long sec = tp->tv_sec;
271 long nsec = tp->tv_nsec + res - 1; 271 long nsec = tp->tv_nsec + res - 1;
272 272
273 if (nsec > NSEC_PER_SEC) { 273 if (nsec >= NSEC_PER_SEC) {
274 sec++; 274 sec++;
275 nsec -= NSEC_PER_SEC; 275 nsec -= NSEC_PER_SEC;
276 } 276 }
@@ -1157,7 +1157,7 @@ retry_delete:
1157} 1157}
1158 1158
1159/* 1159/*
1160 * This is called by __exit_signal, only when there are no more 1160 * This is called by do_exit or de_thread, only when there are no more
1161 * references to the shared signal_struct. 1161 * references to the shared signal_struct.
1162 */ 1162 */
1163void exit_itimers(struct signal_struct *sig) 1163void exit_itimers(struct signal_struct *sig)
@@ -1209,13 +1209,9 @@ static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
1209 1209
1210 do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono); 1210 do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
1211 1211
1212 tp->tv_sec += wall_to_mono.tv_sec; 1212 set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec,
1213 tp->tv_nsec += wall_to_mono.tv_nsec; 1213 tp->tv_nsec + wall_to_mono.tv_nsec);
1214 1214
1215 if ((tp->tv_nsec - NSEC_PER_SEC) > 0) {
1216 tp->tv_nsec -= NSEC_PER_SEC;
1217 tp->tv_sec++;
1218 }
1219 return 0; 1215 return 0;
1220} 1216}
1221 1217
@@ -1295,13 +1291,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
1295 return error; 1291 return error;
1296} 1292}
1297 1293
1298static void nanosleep_wake_up(unsigned long __data)
1299{
1300 struct task_struct *p = (struct task_struct *) __data;
1301
1302 wake_up_process(p);
1303}
1304
1305/* 1294/*
1306 * The standard says that an absolute nanosleep call MUST wake up at 1295 * The standard says that an absolute nanosleep call MUST wake up at
1307 * the requested time in spite of clock settings. Here is what we do: 1296 * the requested time in spite of clock settings. Here is what we do:
@@ -1442,7 +1431,6 @@ static int common_nsleep(clockid_t which_clock,
1442 int flags, struct timespec *tsave) 1431 int flags, struct timespec *tsave)
1443{ 1432{
1444 struct timespec t, dum; 1433 struct timespec t, dum;
1445 struct timer_list new_timer;
1446 DECLARE_WAITQUEUE(abs_wqueue, current); 1434 DECLARE_WAITQUEUE(abs_wqueue, current);
1447 u64 rq_time = (u64)0; 1435 u64 rq_time = (u64)0;
1448 s64 left; 1436 s64 left;
@@ -1451,10 +1439,6 @@ static int common_nsleep(clockid_t which_clock,
1451 &current_thread_info()->restart_block; 1439 &current_thread_info()->restart_block;
1452 1440
1453 abs_wqueue.flags = 0; 1441 abs_wqueue.flags = 0;
1454 init_timer(&new_timer);
1455 new_timer.expires = 0;
1456 new_timer.data = (unsigned long) current;
1457 new_timer.function = nanosleep_wake_up;
1458 abs = flags & TIMER_ABSTIME; 1442 abs = flags & TIMER_ABSTIME;
1459 1443
1460 if (restart_block->fn == clock_nanosleep_restart) { 1444 if (restart_block->fn == clock_nanosleep_restart) {
@@ -1490,13 +1474,8 @@ static int common_nsleep(clockid_t which_clock,
1490 if (left < (s64)0) 1474 if (left < (s64)0)
1491 break; 1475 break;
1492 1476
1493 new_timer.expires = jiffies + left; 1477 schedule_timeout_interruptible(left);
1494 __set_current_state(TASK_INTERRUPTIBLE);
1495 add_timer(&new_timer);
1496
1497 schedule();
1498 1478
1499 del_timer_sync(&new_timer);
1500 left = rq_time - get_jiffies_64(); 1479 left = rq_time - get_jiffies_64();
1501 } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING)); 1480 } while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
1502 1481
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 396c7873e804..5ec248cb7f4a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,6 +19,15 @@ config PM
19 will issue the hlt instruction if nothing is to be done, thereby 19 will issue the hlt instruction if nothing is to be done, thereby
20 sending the processor to sleep and saving power. 20 sending the processor to sleep and saving power.
21 21
22config PM_LEGACY
23 bool "Legacy Power Management API"
24 depends on PM
25 default y
26 ---help---
27 Support for pm_register() and friends.
28
29 If unsure, say Y.
30
22config PM_DEBUG 31config PM_DEBUG
23 bool "Power Management Debug Support" 32 bool "Power Management Debug Support"
24 depends on PM 33 depends on PM
@@ -29,7 +38,7 @@ config PM_DEBUG
29 38
30config SOFTWARE_SUSPEND 39config SOFTWARE_SUSPEND
31 bool "Software Suspend" 40 bool "Software Suspend"
32 depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP)) 41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP)
33 ---help--- 42 ---help---
34 Enable the possibility of suspending the machine. 43 Enable the possibility of suspending the machine.
35 It doesn't need APM. 44 It doesn't need APM.
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 2f438d0eaa13..04be7d0d96a7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,8 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
3EXTRA_CFLAGS += -DDEBUG 3EXTRA_CFLAGS += -DDEBUG
4endif 4endif
5 5
6obj-y := main.o process.o console.o pm.o 6obj-y := main.o process.o console.o
7obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o
8 9
9obj-$(CONFIG_SUSPEND_SMP) += smp.o 10obj-$(CONFIG_SUSPEND_SMP) += smp.o
10 11
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 2d8bf054d036..027322a564f4 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -17,12 +17,12 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pm.h>
20 21
21#include "power.h" 22#include "power.h"
22 23
23 24
24extern suspend_disk_method_t pm_disk_mode; 25extern suspend_disk_method_t pm_disk_mode;
25extern struct pm_ops * pm_ops;
26 26
27extern int swsusp_suspend(void); 27extern int swsusp_suspend(void);
28extern int swsusp_write(void); 28extern int swsusp_write(void);
@@ -30,7 +30,6 @@ extern int swsusp_check(void);
30extern int swsusp_read(void); 30extern int swsusp_read(void);
31extern void swsusp_close(void); 31extern void swsusp_close(void);
32extern int swsusp_resume(void); 32extern int swsusp_resume(void);
33extern int swsusp_free(void);
34 33
35 34
36static int noresume = 0; 35static int noresume = 0;
@@ -49,13 +48,11 @@ dev_t swsusp_resume_device;
49 48
50static void power_down(suspend_disk_method_t mode) 49static void power_down(suspend_disk_method_t mode)
51{ 50{
52 unsigned long flags;
53 int error = 0; 51 int error = 0;
54 52
55 local_irq_save(flags);
56 switch(mode) { 53 switch(mode) {
57 case PM_DISK_PLATFORM: 54 case PM_DISK_PLATFORM:
58 device_shutdown(); 55 kernel_power_off_prepare();
59 error = pm_ops->enter(PM_SUSPEND_DISK); 56 error = pm_ops->enter(PM_SUSPEND_DISK);
60 break; 57 break;
61 case PM_DISK_SHUTDOWN: 58 case PM_DISK_SHUTDOWN:
@@ -95,10 +92,7 @@ static void free_some_memory(void)
95 printk("Freeing memory... "); 92 printk("Freeing memory... ");
96 while ((tmp = shrink_all_memory(10000))) { 93 while ((tmp = shrink_all_memory(10000))) {
97 pages += tmp; 94 pages += tmp;
98 printk("\b%c", p[i]); 95 printk("\b%c", p[i++ % 4]);
99 i++;
100 if (i > 3)
101 i = 0;
102 } 96 }
103 printk("\bdone (%li pages freed)\n", pages); 97 printk("\bdone (%li pages freed)\n", pages);
104} 98}
@@ -180,13 +174,12 @@ int pm_suspend_disk(void)
180 goto Done; 174 goto Done;
181 175
182 if (in_suspend) { 176 if (in_suspend) {
177 device_resume();
183 pr_debug("PM: writing image.\n"); 178 pr_debug("PM: writing image.\n");
184 error = swsusp_write(); 179 error = swsusp_write();
185 if (!error) 180 if (!error)
186 power_down(pm_disk_mode); 181 power_down(pm_disk_mode);
187 else { 182 else {
188 /* swsusp_write can not fail in device_resume,
189 no need to do second device_resume */
190 swsusp_free(); 183 swsusp_free();
191 unprepare_processes(); 184 unprepare_processes();
192 return error; 185 return error;
@@ -254,14 +247,17 @@ static int software_resume(void)
254 247
255 pr_debug("PM: Reading swsusp image.\n"); 248 pr_debug("PM: Reading swsusp image.\n");
256 249
257 if ((error = swsusp_read())) 250 if ((error = swsusp_read())) {
258 goto Cleanup; 251 swsusp_free();
252 goto Thaw;
253 }
259 254
260 pr_debug("PM: Preparing devices for restore.\n"); 255 pr_debug("PM: Preparing devices for restore.\n");
261 256
262 if ((error = device_suspend(PMSG_FREEZE))) { 257 if ((error = device_suspend(PMSG_FREEZE))) {
263 printk("Some devices failed to suspend\n"); 258 printk("Some devices failed to suspend\n");
264 goto Free; 259 swsusp_free();
260 goto Thaw;
265 } 261 }
266 262
267 mb(); 263 mb();
@@ -270,9 +266,7 @@ static int software_resume(void)
270 swsusp_resume(); 266 swsusp_resume();
271 pr_debug("PM: Restore failed, recovering.n"); 267 pr_debug("PM: Restore failed, recovering.n");
272 device_resume(); 268 device_resume();
273 Free: 269 Thaw:
274 swsusp_free();
275 Cleanup:
276 unprepare_processes(); 270 unprepare_processes();
277 Done: 271 Done:
278 /* For success case, the suspend path will release the lock */ 272 /* For success case, the suspend path will release the lock */
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 22bdc93cc038..d253f3ae2fa5 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -24,7 +24,7 @@
24 24
25DECLARE_MUTEX(pm_sem); 25DECLARE_MUTEX(pm_sem);
26 26
27struct pm_ops * pm_ops = NULL; 27struct pm_ops *pm_ops;
28suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; 28suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
29 29
30/** 30/**
@@ -151,6 +151,18 @@ static char *pm_states[PM_SUSPEND_MAX] = {
151#endif 151#endif
152}; 152};
153 153
154static inline int valid_state(suspend_state_t state)
155{
156 /* Suspend-to-disk does not really need low-level support.
157 * It can work with reboot if needed. */
158 if (state == PM_SUSPEND_DISK)
159 return 1;
160
161 if (pm_ops && pm_ops->valid && !pm_ops->valid(state))
162 return 0;
163 return 1;
164}
165
154 166
155/** 167/**
156 * enter_state - Do common work of entering low-power state. 168 * enter_state - Do common work of entering low-power state.
@@ -167,6 +179,8 @@ static int enter_state(suspend_state_t state)
167{ 179{
168 int error; 180 int error;
169 181
182 if (!valid_state(state))
183 return -ENODEV;
170 if (down_trylock(&pm_sem)) 184 if (down_trylock(&pm_sem))
171 return -EBUSY; 185 return -EBUSY;
172 186
@@ -236,8 +250,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
236 char * s = buf; 250 char * s = buf;
237 251
238 for (i = 0; i < PM_SUSPEND_MAX; i++) { 252 for (i = 0; i < PM_SUSPEND_MAX; i++) {
239 if (pm_states[i]) 253 if (pm_states[i] && valid_state(i))
240 s += sprintf(s,"%s ",pm_states[i]); 254 s += sprintf(s,"%s ", pm_states[i]);
241 } 255 }
242 s += sprintf(s,"\n"); 256 s += sprintf(s,"\n");
243 return (s - buf); 257 return (s - buf);
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 159149321b3c..33c508e857dd 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -23,6 +23,7 @@
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/pm_legacy.h>
26#include <linux/interrupt.h> 27#include <linux/interrupt.h>
27 28
28int pm_active; 29int pm_active;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index cd6a3493cc0d..6c042b5ee14b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,7 +1,7 @@
1#include <linux/suspend.h> 1#include <linux/suspend.h>
2#include <linux/utsname.h> 2#include <linux/utsname.h>
3 3
4/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but 4/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but
5 we probably do not take enough locks for switching consoles, etc, 5 we probably do not take enough locks for switching consoles, etc,
6 so bad things might happen. 6 so bad things might happen.
7*/ 7*/
@@ -9,6 +9,9 @@
9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 9#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
10#endif 10#endif
11 11
12#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \
13 - 4 - 3*sizeof(unsigned long) - sizeof(int) \
14 - sizeof(void *)) / sizeof(swp_entry_t))
12 15
13struct swsusp_info { 16struct swsusp_info {
14 struct new_utsname uts; 17 struct new_utsname uts;
@@ -18,7 +21,7 @@ struct swsusp_info {
18 unsigned long image_pages; 21 unsigned long image_pages;
19 unsigned long pagedir_pages; 22 unsigned long pagedir_pages;
20 suspend_pagedir_t * suspend_pagedir; 23 suspend_pagedir_t * suspend_pagedir;
21 swp_entry_t pagedir[768]; 24 swp_entry_t pagedir[MAX_PBES];
22} __attribute__((aligned(PAGE_SIZE))); 25} __attribute__((aligned(PAGE_SIZE)));
23 26
24 27
@@ -50,3 +53,20 @@ extern void thaw_processes(void);
50 53
51extern int pm_prepare_console(void); 54extern int pm_prepare_console(void);
52extern void pm_restore_console(void); 55extern void pm_restore_console(void);
56
57
58/* References to section boundaries */
59extern const void __nosave_begin, __nosave_end;
60
61extern unsigned int nr_copy_pages;
62extern suspend_pagedir_t *pagedir_nosave;
63extern suspend_pagedir_t *pagedir_save;
64
65extern asmlinkage int swsusp_arch_suspend(void);
66extern asmlinkage int swsusp_arch_resume(void);
67
68extern void free_pagedir(struct pbe *pblist);
69extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
70extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
71extern void swsusp_free(void);
72extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
new file mode 100644
index 000000000000..4a6dbcefd378
--- /dev/null
+++ b/kernel/power/snapshot.c
@@ -0,0 +1,453 @@
1/*
2 * linux/kernel/power/snapshot.c
3 *
4 * This file provides system snapshot/restore functionality.
5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
7 *
8 * This file is released under the GPLv2, and is based on swsusp.c.
9 *
10 */
11
12
13#include <linux/module.h>
14#include <linux/mm.h>
15#include <linux/suspend.h>
16#include <linux/smp_lock.h>
17#include <linux/delay.h>
18#include <linux/bitops.h>
19#include <linux/spinlock.h>
20#include <linux/kernel.h>
21#include <linux/pm.h>
22#include <linux/device.h>
23#include <linux/bootmem.h>
24#include <linux/syscalls.h>
25#include <linux/console.h>
26#include <linux/highmem.h>
27
28#include <asm/uaccess.h>
29#include <asm/mmu_context.h>
30#include <asm/pgtable.h>
31#include <asm/tlbflush.h>
32#include <asm/io.h>
33
34#include "power.h"
35
36#ifdef CONFIG_HIGHMEM
37struct highmem_page {
38 char *data;
39 struct page *page;
40 struct highmem_page *next;
41};
42
43static struct highmem_page *highmem_copy;
44
45static int save_highmem_zone(struct zone *zone)
46{
47 unsigned long zone_pfn;
48 mark_free_pages(zone);
49 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
50 struct page *page;
51 struct highmem_page *save;
52 void *kaddr;
53 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
54
55 if (!(pfn%1000))
56 printk(".");
57 if (!pfn_valid(pfn))
58 continue;
59 page = pfn_to_page(pfn);
60 /*
61 * This condition results from rvmalloc() sans vmalloc_32()
62 * and architectural memory reservations. This should be
63 * corrected eventually when the cases giving rise to this
64 * are better understood.
65 */
66 if (PageReserved(page)) {
67 printk("highmem reserved page?!\n");
68 continue;
69 }
70 BUG_ON(PageNosave(page));
71 if (PageNosaveFree(page))
72 continue;
73 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
74 if (!save)
75 return -ENOMEM;
76 save->next = highmem_copy;
77 save->page = page;
78 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
79 if (!save->data) {
80 kfree(save);
81 return -ENOMEM;
82 }
83 kaddr = kmap_atomic(page, KM_USER0);
84 memcpy(save->data, kaddr, PAGE_SIZE);
85 kunmap_atomic(kaddr, KM_USER0);
86 highmem_copy = save;
87 }
88 return 0;
89}
90
91int save_highmem(void)
92{
93 struct zone *zone;
94 int res = 0;
95
96 pr_debug("swsusp: Saving Highmem\n");
97 for_each_zone (zone) {
98 if (is_highmem(zone))
99 res = save_highmem_zone(zone);
100 if (res)
101 return res;
102 }
103 return 0;
104}
105
106int restore_highmem(void)
107{
108 printk("swsusp: Restoring Highmem\n");
109 while (highmem_copy) {
110 struct highmem_page *save = highmem_copy;
111 void *kaddr;
112 highmem_copy = save->next;
113
114 kaddr = kmap_atomic(save->page, KM_USER0);
115 memcpy(kaddr, save->data, PAGE_SIZE);
116 kunmap_atomic(kaddr, KM_USER0);
117 free_page((long) save->data);
118 kfree(save);
119 }
120 return 0;
121}
122#endif
123
124static int pfn_is_nosave(unsigned long pfn)
125{
126 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
127 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
128 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
129}
130
131/**
132 * saveable - Determine whether a page should be cloned or not.
133 * @pfn: The page
134 *
135 * We save a page if it's Reserved, and not in the range of pages
136 * statically defined as 'unsaveable', or if it isn't reserved, and
137 * isn't part of a free chunk of pages.
138 */
139
140static int saveable(struct zone *zone, unsigned long *zone_pfn)
141{
142 unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
143 struct page *page;
144
145 if (!pfn_valid(pfn))
146 return 0;
147
148 page = pfn_to_page(pfn);
149 BUG_ON(PageReserved(page) && PageNosave(page));
150 if (PageNosave(page))
151 return 0;
152 if (PageReserved(page) && pfn_is_nosave(pfn)) {
153 pr_debug("[nosave pfn 0x%lx]", pfn);
154 return 0;
155 }
156 if (PageNosaveFree(page))
157 return 0;
158
159 return 1;
160}
161
162static unsigned count_data_pages(void)
163{
164 struct zone *zone;
165 unsigned long zone_pfn;
166 unsigned int n = 0;
167
168 for_each_zone (zone) {
169 if (is_highmem(zone))
170 continue;
171 mark_free_pages(zone);
172 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
173 n += saveable(zone, &zone_pfn);
174 }
175 return n;
176}
177
178static void copy_data_pages(struct pbe *pblist)
179{
180 struct zone *zone;
181 unsigned long zone_pfn;
182 struct pbe *pbe, *p;
183
184 pbe = pblist;
185 for_each_zone (zone) {
186 if (is_highmem(zone))
187 continue;
188 mark_free_pages(zone);
189 /* This is necessary for swsusp_free() */
190 for_each_pb_page (p, pblist)
191 SetPageNosaveFree(virt_to_page(p));
192 for_each_pbe (p, pblist)
193 SetPageNosaveFree(virt_to_page(p->address));
194 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
195 if (saveable(zone, &zone_pfn)) {
196 struct page *page;
197 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
198 BUG_ON(!pbe);
199 pbe->orig_address = (unsigned long)page_address(page);
200 /* copy_page is not usable for copying task structs. */
201 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
202 pbe = pbe->next;
203 }
204 }
205 }
206 BUG_ON(pbe);
207}
208
209
210/**
211 * free_pagedir - free pages allocated with alloc_pagedir()
212 */
213
214void free_pagedir(struct pbe *pblist)
215{
216 struct pbe *pbe;
217
218 while (pblist) {
219 pbe = (pblist + PB_PAGE_SKIP)->next;
220 ClearPageNosave(virt_to_page(pblist));
221 ClearPageNosaveFree(virt_to_page(pblist));
222 free_page((unsigned long)pblist);
223 pblist = pbe;
224 }
225}
226
227/**
228 * fill_pb_page - Create a list of PBEs on a given memory page
229 */
230
231static inline void fill_pb_page(struct pbe *pbpage)
232{
233 struct pbe *p;
234
235 p = pbpage;
236 pbpage += PB_PAGE_SKIP;
237 do
238 p->next = p + 1;
239 while (++p < pbpage);
240}
241
242/**
243 * create_pbe_list - Create a list of PBEs on top of a given chain
244 * of memory pages allocated with alloc_pagedir()
245 */
246
247void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
248{
249 struct pbe *pbpage, *p;
250 unsigned int num = PBES_PER_PAGE;
251
252 for_each_pb_page (pbpage, pblist) {
253 if (num >= nr_pages)
254 break;
255
256 fill_pb_page(pbpage);
257 num += PBES_PER_PAGE;
258 }
259 if (pbpage) {
260 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
261 p->next = p + 1;
262 p->next = NULL;
263 }
264 pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
265}
266
267/**
268 * @safe_needed - on resume, for storing the PBE list and the image,
269 * we can only use memory pages that do not conflict with the pages
270 * which had been used before suspend.
271 *
272 * The unsafe pages are marked with the PG_nosave_free flag
273 *
274 * Allocated but unusable (ie eaten) memory pages should be marked
275 * so that swsusp_free() can release them
276 */
277
278static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
279{
280 void *res;
281
282 if (safe_needed)
283 do {
284 res = (void *)get_zeroed_page(gfp_mask);
285 if (res && PageNosaveFree(virt_to_page(res)))
286 /* This is for swsusp_free() */
287 SetPageNosave(virt_to_page(res));
288 } while (res && PageNosaveFree(virt_to_page(res)));
289 else
290 res = (void *)get_zeroed_page(gfp_mask);
291 if (res) {
292 SetPageNosave(virt_to_page(res));
293 SetPageNosaveFree(virt_to_page(res));
294 }
295 return res;
296}
297
298unsigned long get_safe_page(gfp_t gfp_mask)
299{
300 return (unsigned long)alloc_image_page(gfp_mask, 1);
301}
302
303/**
304 * alloc_pagedir - Allocate the page directory.
305 *
306 * First, determine exactly how many pages we need and
307 * allocate them.
308 *
309 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
310 * struct pbe elements (pbes) and the last element in the page points
311 * to the next page.
312 *
313 * On each page we set up a list of struct_pbe elements.
314 */
315
316struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed)
317{
318 unsigned int num;
319 struct pbe *pblist, *pbe;
320
321 if (!nr_pages)
322 return NULL;
323
324 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
325 pblist = alloc_image_page(gfp_mask, safe_needed);
326 /* FIXME: rewrite this ugly loop */
327 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
328 pbe = pbe->next, num += PBES_PER_PAGE) {
329 pbe += PB_PAGE_SKIP;
330 pbe->next = alloc_image_page(gfp_mask, safe_needed);
331 }
332 if (!pbe) { /* get_zeroed_page() failed */
333 free_pagedir(pblist);
334 pblist = NULL;
335 }
336 return pblist;
337}
338
339/**
340 * Free pages we allocated for suspend. Suspend pages are allocated
341 * before atomic copy, so we need to free them after resume.
342 */
343
344void swsusp_free(void)
345{
346 struct zone *zone;
347 unsigned long zone_pfn;
348
349 for_each_zone(zone) {
350 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
351 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
352 struct page *page;
353 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
354 if (PageNosave(page) && PageNosaveFree(page)) {
355 ClearPageNosave(page);
356 ClearPageNosaveFree(page);
357 free_page((long) page_address(page));
358 }
359 }
360 }
361}
362
363
364/**
365 * enough_free_mem - Make sure we have enough free memory to snapshot.
366 *
367 * Returns TRUE or FALSE after checking the number of available
368 * free pages.
369 */
370
371static int enough_free_mem(unsigned int nr_pages)
372{
373 pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
374 return nr_free_pages() > (nr_pages + PAGES_FOR_IO +
375 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
376}
377
378int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
379{
380 struct pbe *p;
381
382 for_each_pbe (p, pblist) {
383 p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
384 if (!p->address)
385 return -ENOMEM;
386 }
387 return 0;
388}
389
390static struct pbe *swsusp_alloc(unsigned int nr_pages)
391{
392 struct pbe *pblist;
393
394 if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
395 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
396 return NULL;
397 }
398 create_pbe_list(pblist, nr_pages);
399
400 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
401 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
402 swsusp_free();
403 return NULL;
404 }
405
406 return pblist;
407}
408
409asmlinkage int swsusp_save(void)
410{
411 unsigned int nr_pages;
412
413 pr_debug("swsusp: critical section: \n");
414
415 drain_local_pages();
416 nr_pages = count_data_pages();
417 printk("swsusp: Need to copy %u pages\n", nr_pages);
418
419 pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
420 nr_pages,
421 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
422 PAGES_FOR_IO, nr_free_pages());
423
424 /* This is needed because of the fixed size of swsusp_info */
425 if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
426 return -ENOSPC;
427
428 if (!enough_free_mem(nr_pages)) {
429 printk(KERN_ERR "swsusp: Not enough free memory\n");
430 return -ENOMEM;
431 }
432
433 pagedir_nosave = swsusp_alloc(nr_pages);
434 if (!pagedir_nosave)
435 return -ENOMEM;
436
437 /* During allocating of suspend pagedir, new cold pages may appear.
438 * Kill them.
439 */
440 drain_local_pages();
441 copy_data_pages(pagedir_nosave);
442
443 /*
444 * End of critical section. From now on, we can write to memory,
445 * but we should not touch disk. This especially means we must _not_
446 * touch swap space! Except we must write out our image of course.
447 */
448
449 nr_copy_pages = nr_pages;
450
451 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
452 return 0;
453}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index d967e875ee82..c05f46e7348f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,10 @@
1/* 1/*
2 * linux/kernel/power/swsusp.c 2 * linux/kernel/power/swsusp.c
3 * 3 *
4 * This file is to realize architecture-independent 4 * This file provides code to write suspend image to swap and read it back.
5 * machine suspend feature using pretty near only high-level routines
6 * 5 *
7 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> 6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
8 * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
9 * 8 *
10 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
11 * 10 *
@@ -47,11 +46,7 @@
47#include <linux/utsname.h> 46#include <linux/utsname.h>
48#include <linux/version.h> 47#include <linux/version.h>
49#include <linux/delay.h> 48#include <linux/delay.h>
50#include <linux/reboot.h>
51#include <linux/bitops.h> 49#include <linux/bitops.h>
52#include <linux/vt_kern.h>
53#include <linux/kbd_kern.h>
54#include <linux/keyboard.h>
55#include <linux/spinlock.h> 50#include <linux/spinlock.h>
56#include <linux/genhd.h> 51#include <linux/genhd.h>
57#include <linux/kernel.h> 52#include <linux/kernel.h>
@@ -63,10 +58,8 @@
63#include <linux/swapops.h> 58#include <linux/swapops.h>
64#include <linux/bootmem.h> 59#include <linux/bootmem.h>
65#include <linux/syscalls.h> 60#include <linux/syscalls.h>
66#include <linux/console.h>
67#include <linux/highmem.h> 61#include <linux/highmem.h>
68#include <linux/bio.h> 62#include <linux/bio.h>
69#include <linux/mount.h>
70 63
71#include <asm/uaccess.h> 64#include <asm/uaccess.h>
72#include <asm/mmu_context.h> 65#include <asm/mmu_context.h>
@@ -80,36 +73,31 @@
80 73
81#include "power.h" 74#include "power.h"
82 75
76#ifdef CONFIG_HIGHMEM
77int save_highmem(void);
78int restore_highmem(void);
79#else
80static int save_highmem(void) { return 0; }
81static int restore_highmem(void) { return 0; }
82#endif
83
83#define CIPHER "aes" 84#define CIPHER "aes"
84#define MAXKEY 32 85#define MAXKEY 32
85#define MAXIV 32 86#define MAXIV 32
86 87
87/* References to section boundaries */
88extern const void __nosave_begin, __nosave_end;
89
90/* Variables to be preserved over suspend */
91static int nr_copy_pages_check;
92
93extern char resume_file[]; 88extern char resume_file[];
94 89
95/* Local variables that should not be affected by save */ 90/* Local variables that should not be affected by save */
96static unsigned int nr_copy_pages __nosavedata = 0; 91unsigned int nr_copy_pages __nosavedata = 0;
97 92
98/* Suspend pagedir is allocated before final copy, therefore it 93/* Suspend pagedir is allocated before final copy, therefore it
99 must be freed after resume 94 must be freed after resume
100 95
101 Warning: this is evil. There are actually two pagedirs at time of
102 resume. One is "pagedir_save", which is empty frame allocated at
103 time of suspend, that must be freed. Second is "pagedir_nosave",
104 allocated at time of resume, that travels through memory not to
105 collide with anything.
106
107 Warning: this is even more evil than it seems. Pagedirs this file 96 Warning: this is even more evil than it seems. Pagedirs this file
108 talks about are completely different from page directories used by 97 talks about are completely different from page directories used by
109 MMU hardware. 98 MMU hardware.
110 */ 99 */
111suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; 100suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
112static suspend_pagedir_t *pagedir_save;
113 101
114#define SWSUSP_SIG "S1SUSPEND" 102#define SWSUSP_SIG "S1SUSPEND"
115 103
@@ -124,12 +112,6 @@ static struct swsusp_header {
124static struct swsusp_info swsusp_info; 112static struct swsusp_info swsusp_info;
125 113
126/* 114/*
127 * XXX: We try to keep some more pages free so that I/O operations succeed
128 * without paging. Might this be more?
129 */
130#define PAGES_FOR_IO 512
131
132/*
133 * Saving part... 115 * Saving part...
134 */ 116 */
135 117
@@ -141,8 +123,8 @@ static struct swsusp_info swsusp_info;
141static unsigned short swapfile_used[MAX_SWAPFILES]; 123static unsigned short swapfile_used[MAX_SWAPFILES];
142static unsigned short root_swap; 124static unsigned short root_swap;
143 125
144static int write_page(unsigned long addr, swp_entry_t * loc); 126static int write_page(unsigned long addr, swp_entry_t *loc);
145static int bio_read_page(pgoff_t page_off, void * page); 127static int bio_read_page(pgoff_t page_off, void *page);
146 128
147static u8 key_iv[MAXKEY+MAXIV]; 129static u8 key_iv[MAXKEY+MAXIV];
148 130
@@ -363,7 +345,7 @@ static void lock_swapdevices(void)
363} 345}
364 346
365/** 347/**
366 * write_swap_page - Write one page to a fresh swap location. 348 * write_page - Write one page to a fresh swap location.
367 * @addr: Address we're writing. 349 * @addr: Address we're writing.
368 * @loc: Place to store the entry we used. 350 * @loc: Place to store the entry we used.
369 * 351 *
@@ -374,7 +356,7 @@ static void lock_swapdevices(void)
374 * This is a partial improvement, since we will at least return other 356 * This is a partial improvement, since we will at least return other
375 * errors, though we need to eventually fix the damn code. 357 * errors, though we need to eventually fix the damn code.
376 */ 358 */
377static int write_page(unsigned long addr, swp_entry_t * loc) 359static int write_page(unsigned long addr, swp_entry_t *loc)
378{ 360{
379 swp_entry_t entry; 361 swp_entry_t entry;
380 int error = 0; 362 int error = 0;
@@ -402,15 +384,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
402static void data_free(void) 384static void data_free(void)
403{ 385{
404 swp_entry_t entry; 386 swp_entry_t entry;
405 int i; 387 struct pbe *p;
406 388
407 for (i = 0; i < nr_copy_pages; i++) { 389 for_each_pbe (p, pagedir_nosave) {
408 entry = (pagedir_nosave + i)->swap_address; 390 entry = p->swap_address;
409 if (entry.val) 391 if (entry.val)
410 swap_free(entry); 392 swap_free(entry);
411 else 393 else
412 break; 394 break;
413 (pagedir_nosave + i)->swap_address = (swp_entry_t){0};
414 } 395 }
415} 396}
416 397
@@ -512,8 +493,8 @@ static void free_pagedir_entries(void)
512static int write_pagedir(void) 493static int write_pagedir(void)
513{ 494{
514 int error = 0; 495 int error = 0;
515 unsigned n = 0; 496 unsigned int n = 0;
516 struct pbe * pbe; 497 struct pbe *pbe;
517 498
518 printk( "Writing pagedir..."); 499 printk( "Writing pagedir...");
519 for_each_pb_page (pbe, pagedir_nosave) { 500 for_each_pb_page (pbe, pagedir_nosave) {
@@ -527,6 +508,26 @@ static int write_pagedir(void)
527} 508}
528 509
529/** 510/**
511 * enough_swap - Make sure we have enough swap to save the image.
512 *
513 * Returns TRUE or FALSE after checking the total amount of swap
514 * space available.
515 *
516 * FIXME: si_swapinfo(&i) returns all swap devices information.
517 * We should only consider resume_device.
518 */
519
520static int enough_swap(unsigned int nr_pages)
521{
522 struct sysinfo i;
523
524 si_swapinfo(&i);
525 pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
526 return i.freeswap > (nr_pages + PAGES_FOR_IO +
527 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
528}
529
530/**
530 * write_suspend_image - Write entire image and metadata. 531 * write_suspend_image - Write entire image and metadata.
531 * 532 *
532 */ 533 */
@@ -534,6 +535,11 @@ static int write_suspend_image(void)
534{ 535{
535 int error; 536 int error;
536 537
538 if (!enough_swap(nr_copy_pages)) {
539 printk(KERN_ERR "swsusp: Not enough free swap\n");
540 return -ENOSPC;
541 }
542
537 init_header(); 543 init_header();
538 if ((error = data_write())) 544 if ((error = data_write()))
539 goto FreeData; 545 goto FreeData;
@@ -553,433 +559,6 @@ static int write_suspend_image(void)
553 goto Done; 559 goto Done;
554} 560}
555 561
556
557#ifdef CONFIG_HIGHMEM
558struct highmem_page {
559 char *data;
560 struct page *page;
561 struct highmem_page *next;
562};
563
564static struct highmem_page *highmem_copy;
565
566static int save_highmem_zone(struct zone *zone)
567{
568 unsigned long zone_pfn;
569 mark_free_pages(zone);
570 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
571 struct page *page;
572 struct highmem_page *save;
573 void *kaddr;
574 unsigned long pfn = zone_pfn + zone->zone_start_pfn;
575
576 if (!(pfn%1000))
577 printk(".");
578 if (!pfn_valid(pfn))
579 continue;
580 page = pfn_to_page(pfn);
581 /*
582 * This condition results from rvmalloc() sans vmalloc_32()
583 * and architectural memory reservations. This should be
584 * corrected eventually when the cases giving rise to this
585 * are better understood.
586 */
587 if (PageReserved(page)) {
588 printk("highmem reserved page?!\n");
589 continue;
590 }
591 BUG_ON(PageNosave(page));
592 if (PageNosaveFree(page))
593 continue;
594 save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
595 if (!save)
596 return -ENOMEM;
597 save->next = highmem_copy;
598 save->page = page;
599 save->data = (void *) get_zeroed_page(GFP_ATOMIC);
600 if (!save->data) {
601 kfree(save);
602 return -ENOMEM;
603 }
604 kaddr = kmap_atomic(page, KM_USER0);
605 memcpy(save->data, kaddr, PAGE_SIZE);
606 kunmap_atomic(kaddr, KM_USER0);
607 highmem_copy = save;
608 }
609 return 0;
610}
611#endif /* CONFIG_HIGHMEM */
612
613
614static int save_highmem(void)
615{
616#ifdef CONFIG_HIGHMEM
617 struct zone *zone;
618 int res = 0;
619
620 pr_debug("swsusp: Saving Highmem\n");
621 for_each_zone (zone) {
622 if (is_highmem(zone))
623 res = save_highmem_zone(zone);
624 if (res)
625 return res;
626 }
627#endif
628 return 0;
629}
630
631static int restore_highmem(void)
632{
633#ifdef CONFIG_HIGHMEM
634 printk("swsusp: Restoring Highmem\n");
635 while (highmem_copy) {
636 struct highmem_page *save = highmem_copy;
637 void *kaddr;
638 highmem_copy = save->next;
639
640 kaddr = kmap_atomic(save->page, KM_USER0);
641 memcpy(kaddr, save->data, PAGE_SIZE);
642 kunmap_atomic(kaddr, KM_USER0);
643 free_page((long) save->data);
644 kfree(save);
645 }
646#endif
647 return 0;
648}
649
650
651static int pfn_is_nosave(unsigned long pfn)
652{
653 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
654 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
655 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
656}
657
658/**
659 * saveable - Determine whether a page should be cloned or not.
660 * @pfn: The page
661 *
662 * We save a page if it's Reserved, and not in the range of pages
663 * statically defined as 'unsaveable', or if it isn't reserved, and
664 * isn't part of a free chunk of pages.
665 */
666
667static int saveable(struct zone * zone, unsigned long * zone_pfn)
668{
669 unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
670 struct page * page;
671
672 if (!pfn_valid(pfn))
673 return 0;
674
675 page = pfn_to_page(pfn);
676 BUG_ON(PageReserved(page) && PageNosave(page));
677 if (PageNosave(page))
678 return 0;
679 if (PageReserved(page) && pfn_is_nosave(pfn)) {
680 pr_debug("[nosave pfn 0x%lx]", pfn);
681 return 0;
682 }
683 if (PageNosaveFree(page))
684 return 0;
685
686 return 1;
687}
688
689static void count_data_pages(void)
690{
691 struct zone *zone;
692 unsigned long zone_pfn;
693
694 nr_copy_pages = 0;
695
696 for_each_zone (zone) {
697 if (is_highmem(zone))
698 continue;
699 mark_free_pages(zone);
700 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
701 nr_copy_pages += saveable(zone, &zone_pfn);
702 }
703}
704
705
706static void copy_data_pages(void)
707{
708 struct zone *zone;
709 unsigned long zone_pfn;
710 struct pbe * pbe = pagedir_nosave;
711
712 pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
713 for_each_zone (zone) {
714 if (is_highmem(zone))
715 continue;
716 mark_free_pages(zone);
717 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
718 if (saveable(zone, &zone_pfn)) {
719 struct page * page;
720 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
721 BUG_ON(!pbe);
722 pbe->orig_address = (long) page_address(page);
723 /* copy_page is not usable for copying task structs. */
724 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
725 pbe = pbe->next;
726 }
727 }
728 }
729 BUG_ON(pbe);
730}
731
732
733/**
734 * calc_nr - Determine the number of pages needed for a pbe list.
735 */
736
737static int calc_nr(int nr_copy)
738{
739 return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1);
740}
741
742/**
743 * free_pagedir - free pages allocated with alloc_pagedir()
744 */
745
746static inline void free_pagedir(struct pbe *pblist)
747{
748 struct pbe *pbe;
749
750 while (pblist) {
751 pbe = (pblist + PB_PAGE_SKIP)->next;
752 free_page((unsigned long)pblist);
753 pblist = pbe;
754 }
755}
756
757/**
758 * fill_pb_page - Create a list of PBEs on a given memory page
759 */
760
761static inline void fill_pb_page(struct pbe *pbpage)
762{
763 struct pbe *p;
764
765 p = pbpage;
766 pbpage += PB_PAGE_SKIP;
767 do
768 p->next = p + 1;
769 while (++p < pbpage);
770}
771
772/**
773 * create_pbe_list - Create a list of PBEs on top of a given chain
774 * of memory pages allocated with alloc_pagedir()
775 */
776
777static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
778{
779 struct pbe *pbpage, *p;
780 unsigned num = PBES_PER_PAGE;
781
782 for_each_pb_page (pbpage, pblist) {
783 if (num >= nr_pages)
784 break;
785
786 fill_pb_page(pbpage);
787 num += PBES_PER_PAGE;
788 }
789 if (pbpage) {
790 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
791 p->next = p + 1;
792 p->next = NULL;
793 }
794 pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
795}
796
797/**
798 * alloc_pagedir - Allocate the page directory.
799 *
800 * First, determine exactly how many pages we need and
801 * allocate them.
802 *
803 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
804 * struct pbe elements (pbes) and the last element in the page points
805 * to the next page.
806 *
807 * On each page we set up a list of struct_pbe elements.
808 */
809
810static struct pbe * alloc_pagedir(unsigned nr_pages)
811{
812 unsigned num;
813 struct pbe *pblist, *pbe;
814
815 if (!nr_pages)
816 return NULL;
817
818 pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
819 pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
820 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
821 pbe = pbe->next, num += PBES_PER_PAGE) {
822 pbe += PB_PAGE_SKIP;
823 pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
824 }
825 if (!pbe) { /* get_zeroed_page() failed */
826 free_pagedir(pblist);
827 pblist = NULL;
828 }
829 return pblist;
830}
831
832/**
833 * free_image_pages - Free pages allocated for snapshot
834 */
835
836static void free_image_pages(void)
837{
838 struct pbe * p;
839
840 for_each_pbe (p, pagedir_save) {
841 if (p->address) {
842 ClearPageNosave(virt_to_page(p->address));
843 free_page(p->address);
844 p->address = 0;
845 }
846 }
847}
848
849/**
850 * alloc_image_pages - Allocate pages for the snapshot.
851 */
852
853static int alloc_image_pages(void)
854{
855 struct pbe * p;
856
857 for_each_pbe (p, pagedir_save) {
858 p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
859 if (!p->address)
860 return -ENOMEM;
861 SetPageNosave(virt_to_page(p->address));
862 }
863 return 0;
864}
865
866void swsusp_free(void)
867{
868 BUG_ON(PageNosave(virt_to_page(pagedir_save)));
869 BUG_ON(PageNosaveFree(virt_to_page(pagedir_save)));
870 free_image_pages();
871 free_pagedir(pagedir_save);
872}
873
874
875/**
876 * enough_free_mem - Make sure we enough free memory to snapshot.
877 *
878 * Returns TRUE or FALSE after checking the number of available
879 * free pages.
880 */
881
882static int enough_free_mem(void)
883{
884 if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) {
885 pr_debug("swsusp: Not enough free pages: Have %d\n",
886 nr_free_pages());
887 return 0;
888 }
889 return 1;
890}
891
892
893/**
894 * enough_swap - Make sure we have enough swap to save the image.
895 *
896 * Returns TRUE or FALSE after checking the total amount of swap
897 * space avaiable.
898 *
899 * FIXME: si_swapinfo(&i) returns all swap devices information.
900 * We should only consider resume_device.
901 */
902
903static int enough_swap(void)
904{
905 struct sysinfo i;
906
907 si_swapinfo(&i);
908 if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO)) {
909 pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap);
910 return 0;
911 }
912 return 1;
913}
914
915static int swsusp_alloc(void)
916{
917 int error;
918
919 pagedir_nosave = NULL;
920 nr_copy_pages = calc_nr(nr_copy_pages);
921
922 pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
923 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
924
925 if (!enough_free_mem())
926 return -ENOMEM;
927
928 if (!enough_swap())
929 return -ENOSPC;
930
931 if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
932 printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
933 return -ENOMEM;
934 }
935 create_pbe_list(pagedir_save, nr_copy_pages);
936 pagedir_nosave = pagedir_save;
937 if ((error = alloc_image_pages())) {
938 printk(KERN_ERR "suspend: Allocating image pages failed.\n");
939 swsusp_free();
940 return error;
941 }
942
943 nr_copy_pages_check = nr_copy_pages;
944 return 0;
945}
946
947static int suspend_prepare_image(void)
948{
949 int error;
950
951 pr_debug("swsusp: critical section: \n");
952 if (save_highmem()) {
953 printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n");
954 restore_highmem();
955 return -ENOMEM;
956 }
957
958 drain_local_pages();
959 count_data_pages();
960 printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
961
962 error = swsusp_alloc();
963 if (error)
964 return error;
965
966 /* During allocating of suspend pagedir, new cold pages may appear.
967 * Kill them.
968 */
969 drain_local_pages();
970 copy_data_pages();
971
972 /*
973 * End of critical section. From now on, we can write to memory,
974 * but we should not touch disk. This specially means we must _not_
975 * touch swap space! Except we must write out our image of course.
976 */
977
978 printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
979 return 0;
980}
981
982
983/* It is important _NOT_ to umount filesystems at this point. We want 562/* It is important _NOT_ to umount filesystems at this point. We want
984 * them synced (in case something goes wrong) but we DO not want to mark 563 * them synced (in case something goes wrong) but we DO not want to mark
985 * filesystem clean: it is not. (And it does not matter, if we resume 564 * filesystem clean: it is not. (And it does not matter, if we resume
@@ -988,28 +567,24 @@ static int suspend_prepare_image(void)
988int swsusp_write(void) 567int swsusp_write(void)
989{ 568{
990 int error; 569 int error;
991 device_resume(); 570
571 if ((error = swsusp_swap_check())) {
572 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
573 return error;
574 }
992 lock_swapdevices(); 575 lock_swapdevices();
993 error = write_suspend_image(); 576 error = write_suspend_image();
994 /* This will unlock ignored swap devices since writing is finished */ 577 /* This will unlock ignored swap devices since writing is finished */
995 lock_swapdevices(); 578 lock_swapdevices();
996 return error; 579 return error;
997
998} 580}
999 581
1000 582
1001extern asmlinkage int swsusp_arch_suspend(void);
1002extern asmlinkage int swsusp_arch_resume(void);
1003
1004
1005asmlinkage int swsusp_save(void)
1006{
1007 return suspend_prepare_image();
1008}
1009 583
1010int swsusp_suspend(void) 584int swsusp_suspend(void)
1011{ 585{
1012 int error; 586 int error;
587
1013 if ((error = arch_prepare_suspend())) 588 if ((error = arch_prepare_suspend()))
1014 return error; 589 return error;
1015 local_irq_disable(); 590 local_irq_disable();
@@ -1021,15 +596,12 @@ int swsusp_suspend(void)
1021 */ 596 */
1022 if ((error = device_power_down(PMSG_FREEZE))) { 597 if ((error = device_power_down(PMSG_FREEZE))) {
1023 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); 598 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
1024 local_irq_enable(); 599 goto Enable_irqs;
1025 return error;
1026 } 600 }
1027 601
1028 if ((error = swsusp_swap_check())) { 602 if ((error = save_highmem())) {
1029 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); 603 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
1030 device_power_up(); 604 goto Restore_highmem;
1031 local_irq_enable();
1032 return error;
1033 } 605 }
1034 606
1035 save_processor_state(); 607 save_processor_state();
@@ -1037,9 +609,10 @@ int swsusp_suspend(void)
1037 printk(KERN_ERR "Error %d suspending\n", error); 609 printk(KERN_ERR "Error %d suspending\n", error);
1038 /* Restore control flow magically appears here */ 610 /* Restore control flow magically appears here */
1039 restore_processor_state(); 611 restore_processor_state();
1040 BUG_ON (nr_copy_pages_check != nr_copy_pages); 612Restore_highmem:
1041 restore_highmem(); 613 restore_highmem();
1042 device_power_up(); 614 device_power_up();
615Enable_irqs:
1043 local_irq_enable(); 616 local_irq_enable();
1044 return error; 617 return error;
1045} 618}
@@ -1057,6 +630,11 @@ int swsusp_resume(void)
1057 * execution continues at place where swsusp_arch_suspend was called 630 * execution continues at place where swsusp_arch_suspend was called
1058 */ 631 */
1059 BUG_ON(!error); 632 BUG_ON(!error);
633 /* The only reason why swsusp_arch_resume() can fail is memory being
634 * very tight, so we have to free it as soon as we can to avoid
635 * subsequent failures
636 */
637 swsusp_free();
1060 restore_processor_state(); 638 restore_processor_state();
1061 restore_highmem(); 639 restore_highmem();
1062 touch_softlockup_watchdog(); 640 touch_softlockup_watchdog();
@@ -1066,158 +644,43 @@ int swsusp_resume(void)
1066} 644}
1067 645
1068/** 646/**
1069 * On resume, for storing the PBE list and the image, 647 * mark_unsafe_pages - mark the pages that cannot be used for storing
1070 * we can only use memory pages that do not conflict with the pages 648 * the image during resume, because they conflict with the pages that
1071 * which had been used before suspend. 649 * had been used before suspend
1072 *
1073 * We don't know which pages are usable until we allocate them.
1074 *
1075 * Allocated but unusable (ie eaten) memory pages are linked together
1076 * to create a list, so that we can free them easily
1077 *
1078 * We could have used a type other than (void *)
1079 * for this purpose, but ...
1080 */ 650 */
1081static void **eaten_memory = NULL;
1082
1083static inline void eat_page(void *page)
1084{
1085 void **c;
1086 651
1087 c = eaten_memory; 652static void mark_unsafe_pages(struct pbe *pblist)
1088 eaten_memory = page;
1089 *eaten_memory = c;
1090}
1091
1092static unsigned long get_usable_page(unsigned gfp_mask)
1093{
1094 unsigned long m;
1095
1096 m = get_zeroed_page(gfp_mask);
1097 while (!PageNosaveFree(virt_to_page(m))) {
1098 eat_page((void *)m);
1099 m = get_zeroed_page(gfp_mask);
1100 if (!m)
1101 break;
1102 }
1103 return m;
1104}
1105
1106static void free_eaten_memory(void)
1107{
1108 unsigned long m;
1109 void **c;
1110 int i = 0;
1111
1112 c = eaten_memory;
1113 while (c) {
1114 m = (unsigned long)c;
1115 c = *c;
1116 free_page(m);
1117 i++;
1118 }
1119 eaten_memory = NULL;
1120 pr_debug("swsusp: %d unused pages freed\n", i);
1121}
1122
1123/**
1124 * check_pagedir - We ensure here that pages that the PBEs point to
1125 * won't collide with pages where we're going to restore from the loaded
1126 * pages later
1127 */
1128
1129static int check_pagedir(struct pbe *pblist)
1130{
1131 struct pbe *p;
1132
1133 /* This is necessary, so that we can free allocated pages
1134 * in case of failure
1135 */
1136 for_each_pbe (p, pblist)
1137 p->address = 0UL;
1138
1139 for_each_pbe (p, pblist) {
1140 p->address = get_usable_page(GFP_ATOMIC);
1141 if (!p->address)
1142 return -ENOMEM;
1143 }
1144 return 0;
1145}
1146
1147/**
1148 * swsusp_pagedir_relocate - It is possible, that some memory pages
1149 * occupied by the list of PBEs collide with pages where we're going to
1150 * restore from the loaded pages later. We relocate them here.
1151 */
1152
1153static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1154{ 653{
1155 struct zone *zone; 654 struct zone *zone;
1156 unsigned long zone_pfn; 655 unsigned long zone_pfn;
1157 struct pbe *pbpage, *tail, *p; 656 struct pbe *p;
1158 void *m;
1159 int rel = 0, error = 0;
1160 657
1161 if (!pblist) /* a sanity check */ 658 if (!pblist) /* a sanity check */
1162 return NULL; 659 return;
1163
1164 pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
1165 swsusp_info.pagedir_pages);
1166
1167 /* Set page flags */
1168 660
661 /* Clear page flags */
1169 for_each_zone (zone) { 662 for_each_zone (zone) {
1170 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 663 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
1171 SetPageNosaveFree(pfn_to_page(zone_pfn + 664 if (pfn_valid(zone_pfn + zone->zone_start_pfn))
665 ClearPageNosaveFree(pfn_to_page(zone_pfn +
1172 zone->zone_start_pfn)); 666 zone->zone_start_pfn));
1173 } 667 }
1174 668
1175 /* Clear orig addresses */ 669 /* Mark orig addresses */
1176
1177 for_each_pbe (p, pblist) 670 for_each_pbe (p, pblist)
1178 ClearPageNosaveFree(virt_to_page(p->orig_address)); 671 SetPageNosaveFree(virt_to_page(p->orig_address));
1179
1180 tail = pblist + PB_PAGE_SKIP;
1181
1182 /* Relocate colliding pages */
1183
1184 for_each_pb_page (pbpage, pblist) {
1185 if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
1186 m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
1187 if (!m) {
1188 error = -ENOMEM;
1189 break;
1190 }
1191 memcpy(m, (void *)pbpage, PAGE_SIZE);
1192 if (pbpage == pblist)
1193 pblist = (struct pbe *)m;
1194 else
1195 tail->next = (struct pbe *)m;
1196
1197 eat_page((void *)pbpage);
1198 pbpage = (struct pbe *)m;
1199
1200 /* We have to link the PBEs again */
1201 672
1202 for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++) 673}
1203 if (p->next) /* needed to save the end */
1204 p->next = p + 1;
1205
1206 rel++;
1207 }
1208 tail = pbpage + PB_PAGE_SKIP;
1209 }
1210 674
1211 if (error) { 675static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
1212 printk("\nswsusp: Out of memory\n\n"); 676{
1213 free_pagedir(pblist); 677 /* We assume both lists contain the same number of elements */
1214 free_eaten_memory(); 678 while (src) {
1215 pblist = NULL; 679 dst->orig_address = src->orig_address;
680 dst->swap_address = src->swap_address;
681 dst = dst->next;
682 src = src->next;
1216 } 683 }
1217 else
1218 printk("swsusp: Relocated %d pages\n", rel);
1219
1220 return pblist;
1221} 684}
1222 685
1223/* 686/*
@@ -1231,7 +694,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
1231 694
1232static atomic_t io_done = ATOMIC_INIT(0); 695static atomic_t io_done = ATOMIC_INIT(0);
1233 696
1234static int end_io(struct bio * bio, unsigned int num, int err) 697static int end_io(struct bio *bio, unsigned int num, int err)
1235{ 698{
1236 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 699 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1237 panic("I/O error reading memory image"); 700 panic("I/O error reading memory image");
@@ -1239,7 +702,7 @@ static int end_io(struct bio * bio, unsigned int num, int err)
1239 return 0; 702 return 0;
1240} 703}
1241 704
1242static struct block_device * resume_bdev; 705static struct block_device *resume_bdev;
1243 706
1244/** 707/**
1245 * submit - submit BIO request. 708 * submit - submit BIO request.
@@ -1252,10 +715,10 @@ static struct block_device * resume_bdev;
1252 * Then submit it and wait. 715 * Then submit it and wait.
1253 */ 716 */
1254 717
1255static int submit(int rw, pgoff_t page_off, void * page) 718static int submit(int rw, pgoff_t page_off, void *page)
1256{ 719{
1257 int error = 0; 720 int error = 0;
1258 struct bio * bio; 721 struct bio *bio;
1259 722
1260 bio = bio_alloc(GFP_ATOMIC, 1); 723 bio = bio_alloc(GFP_ATOMIC, 1);
1261 if (!bio) 724 if (!bio)
@@ -1284,12 +747,12 @@ static int submit(int rw, pgoff_t page_off, void * page)
1284 return error; 747 return error;
1285} 748}
1286 749
1287static int bio_read_page(pgoff_t page_off, void * page) 750static int bio_read_page(pgoff_t page_off, void *page)
1288{ 751{
1289 return submit(READ, page_off, page); 752 return submit(READ, page_off, page);
1290} 753}
1291 754
1292static int bio_write_page(pgoff_t page_off, void * page) 755static int bio_write_page(pgoff_t page_off, void *page)
1293{ 756{
1294 return submit(WRITE, page_off, page); 757 return submit(WRITE, page_off, page);
1295} 758}
@@ -1299,7 +762,7 @@ static int bio_write_page(pgoff_t page_off, void * page)
1299 * I really don't think that it's foolproof but more than nothing.. 762 * I really don't think that it's foolproof but more than nothing..
1300 */ 763 */
1301 764
1302static const char * sanity_check(void) 765static const char *sanity_check(void)
1303{ 766{
1304 dump_info(); 767 dump_info();
1305 if (swsusp_info.version_code != LINUX_VERSION_CODE) 768 if (swsusp_info.version_code != LINUX_VERSION_CODE)
@@ -1325,7 +788,7 @@ static const char * sanity_check(void)
1325 788
1326static int check_header(void) 789static int check_header(void)
1327{ 790{
1328 const char * reason = NULL; 791 const char *reason = NULL;
1329 int error; 792 int error;
1330 793
1331 if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) 794 if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
@@ -1356,7 +819,7 @@ static int check_sig(void)
1356 * Reset swap signature now. 819 * Reset swap signature now.
1357 */ 820 */
1358 error = bio_write_page(0, &swsusp_header); 821 error = bio_write_page(0, &swsusp_header);
1359 } else { 822 } else {
1360 return -EINVAL; 823 return -EINVAL;
1361 } 824 }
1362 if (!error) 825 if (!error)
@@ -1373,7 +836,7 @@ static int check_sig(void)
1373 836
1374static int data_read(struct pbe *pblist) 837static int data_read(struct pbe *pblist)
1375{ 838{
1376 struct pbe * p; 839 struct pbe *p;
1377 int error = 0; 840 int error = 0;
1378 int i = 0; 841 int i = 0;
1379 int mod = swsusp_info.image_pages / 100; 842 int mod = swsusp_info.image_pages / 100;
@@ -1411,7 +874,7 @@ static int data_read(struct pbe *pblist)
1411static int read_pagedir(struct pbe *pblist) 874static int read_pagedir(struct pbe *pblist)
1412{ 875{
1413 struct pbe *pbpage, *p; 876 struct pbe *pbpage, *p;
1414 unsigned i = 0; 877 unsigned int i = 0;
1415 int error; 878 int error;
1416 879
1417 if (!pblist) 880 if (!pblist)
@@ -1433,10 +896,8 @@ static int read_pagedir(struct pbe *pblist)
1433 break; 896 break;
1434 } 897 }
1435 898
1436 if (error) 899 if (!error)
1437 free_page((unsigned long)pblist); 900 BUG_ON(i != swsusp_info.pagedir_pages);
1438
1439 BUG_ON(i != swsusp_info.pagedir_pages);
1440 901
1441 return error; 902 return error;
1442} 903}
@@ -1460,32 +921,29 @@ static int read_suspend_image(void)
1460 int error = 0; 921 int error = 0;
1461 struct pbe *p; 922 struct pbe *p;
1462 923
1463 if (!(p = alloc_pagedir(nr_copy_pages))) 924 if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0)))
1464 return -ENOMEM; 925 return -ENOMEM;
1465 926
1466 if ((error = read_pagedir(p))) 927 if ((error = read_pagedir(p)))
1467 return error; 928 return error;
1468
1469 create_pbe_list(p, nr_copy_pages); 929 create_pbe_list(p, nr_copy_pages);
1470 930 mark_unsafe_pages(p);
1471 if (!(pagedir_nosave = swsusp_pagedir_relocate(p))) 931 pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
932 if (pagedir_nosave) {
933 create_pbe_list(pagedir_nosave, nr_copy_pages);
934 copy_page_backup_list(pagedir_nosave, p);
935 }
936 free_pagedir(p);
937 if (!pagedir_nosave)
1472 return -ENOMEM; 938 return -ENOMEM;
1473 939
1474 /* Allocate memory for the image and read the data from swap */ 940 /* Allocate memory for the image and read the data from swap */
1475 941
1476 error = check_pagedir(pagedir_nosave); 942 error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1);
1477 free_eaten_memory(); 943
1478 if (!error) 944 if (!error)
1479 error = data_read(pagedir_nosave); 945 error = data_read(pagedir_nosave);
1480 946
1481 if (error) { /* We fail cleanly */
1482 for_each_pbe (p, pagedir_nosave)
1483 if (p->address) {
1484 free_page(p->address);
1485 p->address = 0UL;
1486 }
1487 free_pagedir(pagedir_nosave);
1488 }
1489 return error; 947 return error;
1490} 948}
1491 949
diff --git a/kernel/printk.c b/kernel/printk.c
index 4b8f0f9230a4..5287be83e3e7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -10,7 +10,7 @@
10 * elsewhere, in preparation for a serial line console (someday). 10 * elsewhere, in preparation for a serial line console (someday).
11 * Ted Ts'o, 2/11/93. 11 * Ted Ts'o, 2/11/93.
12 * Modified for sysctl support, 1/8/97, Chris Horn. 12 * Modified for sysctl support, 1/8/97, Chris Horn.
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfreds@colorfullife.com 14 * manfreds@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au>
@@ -148,7 +148,7 @@ static int __init console_setup(char *str)
148 if (!strcmp(str, "ttyb")) 148 if (!strcmp(str, "ttyb"))
149 strcpy(name, "ttyS1"); 149 strcpy(name, "ttyS1");
150#endif 150#endif
151 for(s = name; *s; s++) 151 for (s = name; *s; s++)
152 if ((*s >= '0' && *s <= '9') || *s == ',') 152 if ((*s >= '0' && *s <= '9') || *s == ',')
153 break; 153 break;
154 idx = simple_strtoul(s, NULL, 10); 154 idx = simple_strtoul(s, NULL, 10);
@@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str)
169 size = roundup_pow_of_two(size); 169 size = roundup_pow_of_two(size);
170 if (size > log_buf_len) { 170 if (size > log_buf_len) {
171 unsigned long start, dest_idx, offset; 171 unsigned long start, dest_idx, offset;
172 char * new_log_buf; 172 char *new_log_buf;
173 173
174 new_log_buf = alloc_bootmem(size); 174 new_log_buf = alloc_bootmem(size);
175 if (!new_log_buf) { 175 if (!new_log_buf) {
176 printk("log_buf_len: allocation failed\n"); 176 printk(KERN_WARNING "log_buf_len: allocation failed\n");
177 goto out; 177 goto out;
178 } 178 }
179 179
@@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str)
193 log_end -= offset; 193 log_end -= offset;
194 spin_unlock_irqrestore(&logbuf_lock, flags); 194 spin_unlock_irqrestore(&logbuf_lock, flags);
195 195
196 printk("log_buf_len: %d\n", log_buf_len); 196 printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
197 } 197 }
198out: 198out:
199
200 return 1; 199 return 1;
201} 200}
202 201
@@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup);
217 * 9 -- Return number of unread characters in the log buffer 216 * 9 -- Return number of unread characters in the log buffer
218 * 10 -- Return size of the log buffer 217 * 10 -- Return size of the log buffer
219 */ 218 */
220int do_syslog(int type, char __user * buf, int len) 219int do_syslog(int type, char __user *buf, int len)
221{ 220{
222 unsigned long i, j, limit, count; 221 unsigned long i, j, limit, count;
223 int do_clear = 0; 222 int do_clear = 0;
@@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len)
244 error = -EFAULT; 243 error = -EFAULT;
245 goto out; 244 goto out;
246 } 245 }
247 error = wait_event_interruptible(log_wait, (log_start - log_end)); 246 error = wait_event_interruptible(log_wait,
247 (log_start - log_end));
248 if (error) 248 if (error)
249 goto out; 249 goto out;
250 i = 0; 250 i = 0;
@@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len)
264 error = i; 264 error = i;
265 break; 265 break;
266 case 4: /* Read/clear last kernel messages */ 266 case 4: /* Read/clear last kernel messages */
267 do_clear = 1; 267 do_clear = 1;
268 /* FALL THRU */ 268 /* FALL THRU */
269 case 3: /* Read last kernel messages */ 269 case 3: /* Read last kernel messages */
270 error = -EINVAL; 270 error = -EINVAL;
@@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len)
288 limit = log_end; 288 limit = log_end;
289 /* 289 /*
290 * __put_user() could sleep, and while we sleep 290 * __put_user() could sleep, and while we sleep
291 * printk() could overwrite the messages 291 * printk() could overwrite the messages
292 * we try to copy to user space. Therefore 292 * we try to copy to user space. Therefore
293 * the messages are copied in reverse. <manfreds> 293 * the messages are copied in reverse. <manfreds>
294 */ 294 */
295 for(i = 0; i < count && !error; i++) { 295 for (i = 0; i < count && !error; i++) {
296 j = limit-1-i; 296 j = limit-1-i;
297 if (j + log_buf_len < log_end) 297 if (j + log_buf_len < log_end)
298 break; 298 break;
@@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len)
306 if (error) 306 if (error)
307 break; 307 break;
308 error = i; 308 error = i;
309 if(i != count) { 309 if (i != count) {
310 int offset = count-error; 310 int offset = count-error;
311 /* buffer overflow during copy, correct user buffer. */ 311 /* buffer overflow during copy, correct user buffer. */
312 for(i=0;i<error;i++) { 312 for (i = 0; i < error; i++) {
313 if (__get_user(c,&buf[i+offset]) || 313 if (__get_user(c,&buf[i+offset]) ||
314 __put_user(c,&buf[i])) { 314 __put_user(c,&buf[i])) {
315 error = -EFAULT; 315 error = -EFAULT;
@@ -351,7 +351,7 @@ out:
351 return error; 351 return error;
352} 352}
353 353
354asmlinkage long sys_syslog(int type, char __user * buf, int len) 354asmlinkage long sys_syslog(int type, char __user *buf, int len)
355{ 355{
356 return do_syslog(type, buf, len); 356 return do_syslog(type, buf, len);
357} 357}
@@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end)
404 cur_index = start; 404 cur_index = start;
405 start_print = start; 405 start_print = start;
406 while (cur_index != end) { 406 while (cur_index != end) {
407 if ( msg_level < 0 && 407 if (msg_level < 0 && ((end - cur_index) > 2) &&
408 ((end - cur_index) > 2) && 408 LOG_BUF(cur_index + 0) == '<' &&
409 LOG_BUF(cur_index + 0) == '<' && 409 LOG_BUF(cur_index + 1) >= '0' &&
410 LOG_BUF(cur_index + 1) >= '0' && 410 LOG_BUF(cur_index + 1) <= '7' &&
411 LOG_BUF(cur_index + 1) <= '7' && 411 LOG_BUF(cur_index + 2) == '>') {
412 LOG_BUF(cur_index + 2) == '>')
413 {
414 msg_level = LOG_BUF(cur_index + 1) - '0'; 412 msg_level = LOG_BUF(cur_index + 1) - '0';
415 cur_index += 3; 413 cur_index += 3;
416 start_print = cur_index; 414 start_print = cur_index;
417 } 415 }
418 while (cur_index != end) { 416 while (cur_index != end) {
419 char c = LOG_BUF(cur_index); 417 char c = LOG_BUF(cur_index);
420 cur_index++;
421 418
419 cur_index++;
422 if (c == '\n') { 420 if (c == '\n') {
423 if (msg_level < 0) { 421 if (msg_level < 0) {
424 /* 422 /*
@@ -461,7 +459,7 @@ static void zap_locks(void)
461 static unsigned long oops_timestamp; 459 static unsigned long oops_timestamp;
462 460
463 if (time_after_eq(jiffies, oops_timestamp) && 461 if (time_after_eq(jiffies, oops_timestamp) &&
464 !time_after(jiffies, oops_timestamp + 30*HZ)) 462 !time_after(jiffies, oops_timestamp + 30 * HZ))
465 return; 463 return;
466 464
467 oops_timestamp = jiffies; 465 oops_timestamp = jiffies;
@@ -493,9 +491,12 @@ __attribute__((weak)) unsigned long long printk_clock(void)
493 return sched_clock(); 491 return sched_clock();
494} 492}
495 493
496/* 494/**
495 * printk - print a kernel message
496 * @fmt: format string
497 *
497 * This is printk. It can be called from any context. We want it to work. 498 * This is printk. It can be called from any context. We want it to work.
498 * 499 *
499 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 500 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
500 * call the console drivers. If we fail to get the semaphore we place the output 501 * call the console drivers. If we fail to get the semaphore we place the output
501 * into the log buffer and return. The current holder of the console_sem will 502 * into the log buffer and return. The current holder of the console_sem will
@@ -505,6 +506,9 @@ __attribute__((weak)) unsigned long long printk_clock(void)
505 * One effect of this deferred printing is that code which calls printk() and 506 * One effect of this deferred printing is that code which calls printk() and
506 * then changes console_loglevel may break. This is because console_loglevel 507 * then changes console_loglevel may break. This is because console_loglevel
507 * is inspected when the actual printing occurs. 508 * is inspected when the actual printing occurs.
509 *
510 * See also:
511 * printf(3)
508 */ 512 */
509 513
510asmlinkage int printk(const char *fmt, ...) 514asmlinkage int printk(const char *fmt, ...)
@@ -639,18 +643,27 @@ EXPORT_SYMBOL(vprintk);
639 643
640#else 644#else
641 645
642asmlinkage long sys_syslog(int type, char __user * buf, int len) 646asmlinkage long sys_syslog(int type, char __user *buf, int len)
643{ 647{
644 return 0; 648 return 0;
645} 649}
646 650
647int do_syslog(int type, char __user * buf, int len) { return 0; } 651int do_syslog(int type, char __user *buf, int len)
648static void call_console_drivers(unsigned long start, unsigned long end) {} 652{
653 return 0;
654}
655
656static void call_console_drivers(unsigned long start, unsigned long end)
657{
658}
649 659
650#endif 660#endif
651 661
652/** 662/**
653 * add_preferred_console - add a device to the list of preferred consoles. 663 * add_preferred_console - add a device to the list of preferred consoles.
664 * @name: device name
665 * @idx: device index
666 * @options: options for this console
654 * 667 *
655 * The last preferred console added will be used for kernel messages 668 * The last preferred console added will be used for kernel messages
656 * and stdin/out/err for init. Normally this is used by console_setup 669 * and stdin/out/err for init. Normally this is used by console_setup
@@ -760,7 +773,8 @@ void release_console_sem(void)
760} 773}
761EXPORT_SYMBOL(release_console_sem); 774EXPORT_SYMBOL(release_console_sem);
762 775
763/** console_conditional_schedule - yield the CPU if required 776/**
777 * console_conditional_schedule - yield the CPU if required
764 * 778 *
765 * If the console code is currently allowed to sleep, and 779 * If the console code is currently allowed to sleep, and
766 * if this CPU should yield the CPU to another task, do 780 * if this CPU should yield the CPU to another task, do
@@ -802,7 +816,6 @@ void console_unblank(void)
802 c->unblank(); 816 c->unblank();
803 release_console_sem(); 817 release_console_sem();
804} 818}
805EXPORT_SYMBOL(console_unblank);
806 819
807/* 820/*
808 * Return the console tty driver structure and its associated index 821 * Return the console tty driver structure and its associated index
@@ -851,9 +864,9 @@ EXPORT_SYMBOL(console_start);
851 * print any messages that were printed by the kernel before the 864 * print any messages that were printed by the kernel before the
852 * console driver was initialized. 865 * console driver was initialized.
853 */ 866 */
854void register_console(struct console * console) 867void register_console(struct console *console)
855{ 868{
856 int i; 869 int i;
857 unsigned long flags; 870 unsigned long flags;
858 871
859 if (preferred_console < 0) 872 if (preferred_console < 0)
@@ -878,7 +891,8 @@ void register_console(struct console * console)
878 * See if this console matches one we selected on 891 * See if this console matches one we selected on
879 * the command line. 892 * the command line.
880 */ 893 */
881 for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) { 894 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
895 i++) {
882 if (strcmp(console_cmdline[i].name, console->name) != 0) 896 if (strcmp(console_cmdline[i].name, console->name) != 0)
883 continue; 897 continue;
884 if (console->index >= 0 && 898 if (console->index >= 0 &&
@@ -933,26 +947,26 @@ void register_console(struct console * console)
933} 947}
934EXPORT_SYMBOL(register_console); 948EXPORT_SYMBOL(register_console);
935 949
936int unregister_console(struct console * console) 950int unregister_console(struct console *console)
937{ 951{
938 struct console *a,*b; 952 struct console *a, *b;
939 int res = 1; 953 int res = 1;
940 954
941 acquire_console_sem(); 955 acquire_console_sem();
942 if (console_drivers == console) { 956 if (console_drivers == console) {
943 console_drivers=console->next; 957 console_drivers=console->next;
944 res = 0; 958 res = 0;
945 } else { 959 } else if (console_drivers) {
946 for (a=console_drivers->next, b=console_drivers ; 960 for (a=console_drivers->next, b=console_drivers ;
947 a; b=a, a=b->next) { 961 a; b=a, a=b->next) {
948 if (a == console) { 962 if (a == console) {
949 b->next = a->next; 963 b->next = a->next;
950 res = 0; 964 res = 0;
951 break; 965 break;
952 } 966 }
953 } 967 }
954 } 968 }
955 969
956 /* If last console is removed, we re-enable picking the first 970 /* If last console is removed, we re-enable picking the first
957 * one that gets registered. Without that, pmac early boot console 971 * one that gets registered. Without that, pmac early boot console
958 * would prevent fbcon from taking over. 972 * would prevent fbcon from taking over.
@@ -972,6 +986,8 @@ EXPORT_SYMBOL(unregister_console);
972 986
973/** 987/**
974 * tty_write_message - write a message to a certain tty, not just the console. 988 * tty_write_message - write a message to a certain tty, not just the console.
989 * @tty: the destination tty_struct
990 * @msg: the message to write
975 * 991 *
976 * This is used for messages that need to be redirected to a specific tty. 992 * This is used for messages that need to be redirected to a specific tty.
977 * We don't put it into the syslog queue right now maybe in the future if 993 * We don't put it into the syslog queue right now maybe in the future if
@@ -994,7 +1010,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
994int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1010int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
995{ 1011{
996 static DEFINE_SPINLOCK(ratelimit_lock); 1012 static DEFINE_SPINLOCK(ratelimit_lock);
997 static unsigned long toks = 10*5*HZ; 1013 static unsigned long toks = 10 * 5 * HZ;
998 static unsigned long last_msg; 1014 static unsigned long last_msg;
999 static int missed; 1015 static int missed;
1000 unsigned long flags; 1016 unsigned long flags;
@@ -1007,6 +1023,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1007 toks = ratelimit_burst * ratelimit_jiffies; 1023 toks = ratelimit_burst * ratelimit_jiffies;
1008 if (toks >= ratelimit_jiffies) { 1024 if (toks >= ratelimit_jiffies) {
1009 int lost = missed; 1025 int lost = missed;
1026
1010 missed = 0; 1027 missed = 0;
1011 toks -= ratelimit_jiffies; 1028 toks -= ratelimit_jiffies;
1012 spin_unlock_irqrestore(&ratelimit_lock, flags); 1029 spin_unlock_irqrestore(&ratelimit_lock, flags);
@@ -1021,7 +1038,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
1021EXPORT_SYMBOL(__printk_ratelimit); 1038EXPORT_SYMBOL(__printk_ratelimit);
1022 1039
1023/* minimum time in jiffies between messages */ 1040/* minimum time in jiffies between messages */
1024int printk_ratelimit_jiffies = 5*HZ; 1041int printk_ratelimit_jiffies = 5 * HZ;
1025 1042
1026/* number of messages we send before ratelimiting */ 1043/* number of messages we send before ratelimiting */
1027int printk_ratelimit_burst = 10; 1044int printk_ratelimit_burst = 10;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 019e04ec065a..656476eedb1b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child)
56 signal_wake_up(child, 1); 56 signal_wake_up(child, 1);
57 } 57 }
58 } 58 }
59 if (child->signal->flags & SIGNAL_GROUP_EXIT) {
60 sigaddset(&child->pending.signal, SIGKILL);
61 signal_wake_up(child, 1);
62 }
59 spin_unlock(&child->sighand->siglock); 63 spin_unlock(&child->sighand->siglock);
60} 64}
61 65
@@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child)
77 SET_LINKS(child); 81 SET_LINKS(child);
78 } 82 }
79 83
80 if (child->state == TASK_TRACED) 84 ptrace_untrace(child);
81 ptrace_untrace(child);
82} 85}
83 86
84/* 87/*
@@ -152,7 +155,7 @@ int ptrace_attach(struct task_struct *task)
152 retval = -EPERM; 155 retval = -EPERM;
153 if (task->pid <= 1) 156 if (task->pid <= 1)
154 goto bad; 157 goto bad;
155 if (task == current) 158 if (task->tgid == current->tgid)
156 goto bad; 159 goto bad;
157 /* the same process cannot be attached many times */ 160 /* the same process cannot be attached many times */
158 if (task->ptrace & PT_PTRACED) 161 if (task->ptrace & PT_PTRACED)
@@ -238,7 +241,8 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
238 if (write) { 241 if (write) {
239 copy_to_user_page(vma, page, addr, 242 copy_to_user_page(vma, page, addr,
240 maddr + offset, buf, bytes); 243 maddr + offset, buf, bytes);
241 set_page_dirty_lock(page); 244 if (!PageCompound(page))
245 set_page_dirty_lock(page);
242 } else { 246 } else {
243 copy_from_user_page(vma, page, addr, 247 copy_from_user_page(vma, page, addr,
244 buf, maddr + offset, bytes); 248 buf, maddr + offset, bytes);
@@ -403,3 +407,85 @@ int ptrace_request(struct task_struct *child, long request,
403 407
404 return ret; 408 return ret;
405} 409}
410
411#ifndef __ARCH_SYS_PTRACE
412static int ptrace_get_task_struct(long request, long pid,
413 struct task_struct **childp)
414{
415 struct task_struct *child;
416 int ret;
417
418 /*
419 * Callers use child == NULL as an indication to exit early even
420 * when the return value is 0, so make sure it is non-NULL here.
421 */
422 *childp = NULL;
423
424 if (request == PTRACE_TRACEME) {
425 /*
426 * Are we already being traced?
427 */
428 if (current->ptrace & PT_PTRACED)
429 return -EPERM;
430 ret = security_ptrace(current->parent, current);
431 if (ret)
432 return -EPERM;
433 /*
434 * Set the ptrace bit in the process ptrace flags.
435 */
436 current->ptrace |= PT_PTRACED;
437 return 0;
438 }
439
440 /*
441 * You may not mess with init
442 */
443 if (pid == 1)
444 return -EPERM;
445
446 ret = -ESRCH;
447 read_lock(&tasklist_lock);
448 child = find_task_by_pid(pid);
449 if (child)
450 get_task_struct(child);
451 read_unlock(&tasklist_lock);
452 if (!child)
453 return -ESRCH;
454
455 *childp = child;
456 return 0;
457}
458
459asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
460{
461 struct task_struct *child;
462 long ret;
463
464 /*
465 * This lock_kernel fixes a subtle race with suid exec
466 */
467 lock_kernel();
468 ret = ptrace_get_task_struct(request, pid, &child);
469 if (!child)
470 goto out;
471
472 if (request == PTRACE_ATTACH) {
473 ret = ptrace_attach(child);
474 goto out_put_task_struct;
475 }
476
477 ret = ptrace_check_attach(child, request == PTRACE_KILL);
478 if (ret < 0)
479 goto out_put_task_struct;
480
481 ret = arch_ptrace(child, request, addr, data);
482 if (ret < 0)
483 goto out_put_task_struct;
484
485 out_put_task_struct:
486 put_task_struct(child);
487 out:
488 unlock_kernel();
489 return ret;
490}
491#endif /* __ARCH_SYS_PTRACE */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index bef3b6901b76..c4d159a21e04 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -71,7 +71,7 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
71 71
72/* Fake initialization required by compiler */ 72/* Fake initialization required by compiler */
73static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 73static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
74static int maxbatch = 10; 74static int maxbatch = 10000;
75 75
76#ifndef __HAVE_ARCH_CMPXCHG 76#ifndef __HAVE_ARCH_CMPXCHG
77/* 77/*
@@ -109,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head,
109 rdp = &__get_cpu_var(rcu_data); 109 rdp = &__get_cpu_var(rcu_data);
110 *rdp->nxttail = head; 110 *rdp->nxttail = head;
111 rdp->nxttail = &head->next; 111 rdp->nxttail = &head->next;
112
113 if (unlikely(++rdp->count > 10000))
114 set_need_resched();
115
112 local_irq_restore(flags); 116 local_irq_restore(flags);
113} 117}
114 118
@@ -140,10 +144,25 @@ void fastcall call_rcu_bh(struct rcu_head *head,
140 rdp = &__get_cpu_var(rcu_bh_data); 144 rdp = &__get_cpu_var(rcu_bh_data);
141 *rdp->nxttail = head; 145 *rdp->nxttail = head;
142 rdp->nxttail = &head->next; 146 rdp->nxttail = &head->next;
147 rdp->count++;
148/*
149 * Should we directly call rcu_do_batch() here ?
150 * if (unlikely(rdp->count > 10000))
151 * rcu_do_batch(rdp);
152 */
143 local_irq_restore(flags); 153 local_irq_restore(flags);
144} 154}
145 155
146/* 156/*
157 * Return the number of RCU batches processed thus far. Useful
158 * for debug and statistics.
159 */
160long rcu_batches_completed(void)
161{
162 return rcu_ctrlblk.completed;
163}
164
165/*
147 * Invoke the completed RCU callbacks. They are expected to be in 166 * Invoke the completed RCU callbacks. They are expected to be in
148 * a per-cpu list. 167 * a per-cpu list.
149 */ 168 */
@@ -157,6 +176,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
157 next = rdp->donelist = list->next; 176 next = rdp->donelist = list->next;
158 list->func(list); 177 list->func(list);
159 list = next; 178 list = next;
179 rdp->count--;
160 if (++count >= maxbatch) 180 if (++count >= maxbatch)
161 break; 181 break;
162 } 182 }
@@ -490,6 +510,7 @@ void synchronize_kernel(void)
490} 510}
491 511
492module_param(maxbatch, int, 0); 512module_param(maxbatch, int, 0);
513EXPORT_SYMBOL_GPL(rcu_batches_completed);
493EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */ 514EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */
494EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 515EXPORT_SYMBOL(call_rcu_bh); /* WARNING: GPL-only in April 2006. */
495EXPORT_SYMBOL_GPL(synchronize_rcu); 516EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
new file mode 100644
index 000000000000..88c28d476550
--- /dev/null
+++ b/kernel/rcutorture.c
@@ -0,0 +1,514 @@
1/*
2 * Read-Copy Update /proc-based torture test facility
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2005
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 *
22 * See also: Documentation/RCU/torture.txt
23 */
24#include <linux/types.h>
25#include <linux/kernel.h>
26#include <linux/init.h>
27#include <linux/module.h>
28#include <linux/kthread.h>
29#include <linux/err.h>
30#include <linux/spinlock.h>
31#include <linux/smp.h>
32#include <linux/rcupdate.h>
33#include <linux/interrupt.h>
34#include <linux/sched.h>
35#include <asm/atomic.h>
36#include <linux/bitops.h>
37#include <linux/module.h>
38#include <linux/completion.h>
39#include <linux/moduleparam.h>
40#include <linux/percpu.h>
41#include <linux/notifier.h>
42#include <linux/rcuref.h>
43#include <linux/cpu.h>
44#include <linux/random.h>
45#include <linux/delay.h>
46#include <linux/byteorder/swabb.h>
47#include <linux/stat.h>
48
49MODULE_LICENSE("GPL");
50
51static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
52static int stat_interval = 0; /* Interval between stats, in seconds. */
53 /* Defaults to "only at end of test". */
54static int verbose = 0; /* Print more debug info. */
55
56MODULE_PARM(nreaders, "i");
57MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
58MODULE_PARM(stat_interval, "i");
59MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
60MODULE_PARM(verbose, "i");
61MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
62#define TORTURE_FLAG "rcutorture: "
63#define PRINTK_STRING(s) \
64 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
65#define VERBOSE_PRINTK_STRING(s) \
66 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
67#define VERBOSE_PRINTK_ERRSTRING(s) \
68 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
69
70static char printk_buf[4096];
71
72static int nrealreaders;
73static struct task_struct *writer_task;
74static struct task_struct **reader_tasks;
75static struct task_struct *stats_task;
76
77#define RCU_TORTURE_PIPE_LEN 10
78
79struct rcu_torture {
80 struct rcu_head rtort_rcu;
81 int rtort_pipe_count;
82 struct list_head rtort_free;
83 int rtort_mbtest;
84};
85
86static int fullstop = 0; /* stop generating callbacks at test end. */
87static LIST_HEAD(rcu_torture_freelist);
88static struct rcu_torture *rcu_torture_current = NULL;
89static long rcu_torture_current_version = 0;
90static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
91static DEFINE_SPINLOCK(rcu_torture_lock);
92static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
93 { 0 };
94static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
95 { 0 };
96static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
97atomic_t n_rcu_torture_alloc;
98atomic_t n_rcu_torture_alloc_fail;
99atomic_t n_rcu_torture_free;
100atomic_t n_rcu_torture_mberror;
101atomic_t n_rcu_torture_error;
102
103/*
104 * Allocate an element from the rcu_tortures pool.
105 */
106struct rcu_torture *
107rcu_torture_alloc(void)
108{
109 struct list_head *p;
110
111 spin_lock(&rcu_torture_lock);
112 if (list_empty(&rcu_torture_freelist)) {
113 atomic_inc(&n_rcu_torture_alloc_fail);
114 spin_unlock(&rcu_torture_lock);
115 return NULL;
116 }
117 atomic_inc(&n_rcu_torture_alloc);
118 p = rcu_torture_freelist.next;
119 list_del_init(p);
120 spin_unlock(&rcu_torture_lock);
121 return container_of(p, struct rcu_torture, rtort_free);
122}
123
124/*
125 * Free an element to the rcu_tortures pool.
126 */
127static void
128rcu_torture_free(struct rcu_torture *p)
129{
130 atomic_inc(&n_rcu_torture_free);
131 spin_lock(&rcu_torture_lock);
132 list_add_tail(&p->rtort_free, &rcu_torture_freelist);
133 spin_unlock(&rcu_torture_lock);
134}
135
136static void
137rcu_torture_cb(struct rcu_head *p)
138{
139 int i;
140 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
141
142 if (fullstop) {
143 /* Test is ending, just drop callbacks on the floor. */
144 /* The next initialization will pick up the pieces. */
145 return;
146 }
147 i = rp->rtort_pipe_count;
148 if (i > RCU_TORTURE_PIPE_LEN)
149 i = RCU_TORTURE_PIPE_LEN;
150 atomic_inc(&rcu_torture_wcount[i]);
151 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
152 rp->rtort_mbtest = 0;
153 rcu_torture_free(rp);
154 } else
155 call_rcu(p, rcu_torture_cb);
156}
157
/*
 * State for rcu_random().
 *
 * rrs_count must be signed: rcu_random() counts it down and reseeds
 * when it drops below zero.  With an unsigned counter that test could
 * never be true and get_random_bytes() would never be consulted.
 */
struct rcu_random_state {
	unsigned long rrs_state;
	long rrs_count;
};

#define RCU_RANDOM_MULT 39916801  /* prime */
#define RCU_RANDOM_ADD	479001701 /* prime */
#define RCU_RANDOM_REFRESH 10000

#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 *
 * Fresh random bytes are mixed into the state on the first call for a
 * zero-initialised state and then once every RCU_RANDOM_REFRESH + 1
 * calls.
 */
static long
rcu_random(struct rcu_random_state *rrsp)
{
	long refresh;

	if (--rrsp->rrs_count < 0) {
		get_random_bytes(&refresh, sizeof(refresh));
		rrsp->rrs_state += refresh;
		rrsp->rrs_count = RCU_RANDOM_REFRESH;
	}
	rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
	return swahw32(rrsp->rrs_state);
}
186
187/*
188 * RCU torture writer kthread. Repeatedly substitutes a new structure
189 * for that pointed to by rcu_torture_current, freeing the old structure
190 * after a series of grace periods (the "pipeline").
191 */
192static int
193rcu_torture_writer(void *arg)
194{
195 int i;
196 long oldbatch = rcu_batches_completed();
197 struct rcu_torture *rp;
198 struct rcu_torture *old_rp;
199 static DEFINE_RCU_RANDOM(rand);
200
201 VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
202 set_user_nice(current, 19);
203
204 do {
205 schedule_timeout_uninterruptible(1);
206 if (rcu_batches_completed() == oldbatch)
207 continue;
208 if ((rp = rcu_torture_alloc()) == NULL)
209 continue;
210 rp->rtort_pipe_count = 0;
211 udelay(rcu_random(&rand) & 0x3ff);
212 old_rp = rcu_torture_current;
213 rp->rtort_mbtest = 1;
214 rcu_assign_pointer(rcu_torture_current, rp);
215 smp_wmb();
216 if (old_rp != NULL) {
217 i = old_rp->rtort_pipe_count;
218 if (i > RCU_TORTURE_PIPE_LEN)
219 i = RCU_TORTURE_PIPE_LEN;
220 atomic_inc(&rcu_torture_wcount[i]);
221 old_rp->rtort_pipe_count++;
222 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
223 }
224 rcu_torture_current_version++;
225 oldbatch = rcu_batches_completed();
226 } while (!kthread_should_stop() && !fullstop);
227 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
228 while (!kthread_should_stop())
229 schedule_timeout_uninterruptible(1);
230 return 0;
231}
232
233/*
234 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
235 * incrementing the corresponding element of the pipeline array. The
236 * counter in the element should never be greater than 1, otherwise, the
237 * RCU implementation is broken.
238 */
239static int
240rcu_torture_reader(void *arg)
241{
242 int completed;
243 DEFINE_RCU_RANDOM(rand);
244 struct rcu_torture *p;
245 int pipe_count;
246
247 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
248 set_user_nice(current, 19);
249
250 do {
251 rcu_read_lock();
252 completed = rcu_batches_completed();
253 p = rcu_dereference(rcu_torture_current);
254 if (p == NULL) {
255 /* Wait for rcu_torture_writer to get underway */
256 rcu_read_unlock();
257 schedule_timeout_interruptible(HZ);
258 continue;
259 }
260 if (p->rtort_mbtest == 0)
261 atomic_inc(&n_rcu_torture_mberror);
262 udelay(rcu_random(&rand) & 0x7f);
263 preempt_disable();
264 pipe_count = p->rtort_pipe_count;
265 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
266 /* Should not happen, but... */
267 pipe_count = RCU_TORTURE_PIPE_LEN;
268 }
269 ++__get_cpu_var(rcu_torture_count)[pipe_count];
270 completed = rcu_batches_completed() - completed;
271 if (completed > RCU_TORTURE_PIPE_LEN) {
272 /* Should not happen, but... */
273 completed = RCU_TORTURE_PIPE_LEN;
274 }
275 ++__get_cpu_var(rcu_torture_batch)[completed];
276 preempt_enable();
277 rcu_read_unlock();
278 schedule();
279 } while (!kthread_should_stop() && !fullstop);
280 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
281 while (!kthread_should_stop())
282 schedule_timeout_uninterruptible(1);
283 return 0;
284}
285
286/*
287 * Create an RCU-torture statistics message in the specified buffer.
288 */
289static int
290rcu_torture_printk(char *page)
291{
292 int cnt = 0;
293 int cpu;
294 int i;
295 long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
296 long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
297
298 for_each_cpu(cpu) {
299 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
300 pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
301 batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
302 }
303 }
304 for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
305 if (pipesummary[i] != 0)
306 break;
307 }
308 cnt += sprintf(&page[cnt], "rcutorture: ");
309 cnt += sprintf(&page[cnt],
310 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
311 "rtmbe: %d",
312 rcu_torture_current,
313 rcu_torture_current_version,
314 list_empty(&rcu_torture_freelist),
315 atomic_read(&n_rcu_torture_alloc),
316 atomic_read(&n_rcu_torture_alloc_fail),
317 atomic_read(&n_rcu_torture_free),
318 atomic_read(&n_rcu_torture_mberror));
319 if (atomic_read(&n_rcu_torture_mberror) != 0)
320 cnt += sprintf(&page[cnt], " !!!");
321 cnt += sprintf(&page[cnt], "\nrcutorture: ");
322 if (i > 1) {
323 cnt += sprintf(&page[cnt], "!!! ");
324 atomic_inc(&n_rcu_torture_error);
325 }
326 cnt += sprintf(&page[cnt], "Reader Pipe: ");
327 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
328 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
329 cnt += sprintf(&page[cnt], "\nrcutorture: ");
330 cnt += sprintf(&page[cnt], "Reader Batch: ");
331 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
332 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
333 cnt += sprintf(&page[cnt], "\nrcutorture: ");
334 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
335 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
336 cnt += sprintf(&page[cnt], " %d",
337 atomic_read(&rcu_torture_wcount[i]));
338 }
339 cnt += sprintf(&page[cnt], "\n");
340 return cnt;
341}
342
343/*
344 * Print torture statistics. Caller must ensure that there is only
345 * one call to this function at a given time!!! This is normally
346 * accomplished by relying on the module system to only have one copy
347 * of the module loaded, and then by giving the rcu_torture_stats
348 * kthread full control (or the init/cleanup functions when rcu_torture_stats
349 * thread is not running).
350 */
351static void
352rcu_torture_stats_print(void)
353{
354 int cnt;
355
356 cnt = rcu_torture_printk(printk_buf);
357 printk(KERN_ALERT "%s", printk_buf);
358}
359
360/*
361 * Periodically prints torture statistics, if periodic statistics printing
362 * was specified via the stat_interval module parameter.
363 *
364 * No need to worry about fullstop here, since this one doesn't reference
365 * volatile state or register callbacks.
366 */
367static int
368rcu_torture_stats(void *arg)
369{
370 VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
371 do {
372 schedule_timeout_interruptible(stat_interval * HZ);
373 rcu_torture_stats_print();
374 } while (!kthread_should_stop());
375 VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
376 return 0;
377}
378
379static void
380rcu_torture_cleanup(void)
381{
382 int i;
383
384 fullstop = 1;
385 if (writer_task != NULL) {
386 VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
387 kthread_stop(writer_task);
388 }
389 writer_task = NULL;
390
391 if (reader_tasks != NULL) {
392 for (i = 0; i < nrealreaders; i++) {
393 if (reader_tasks[i] != NULL) {
394 VERBOSE_PRINTK_STRING(
395 "Stopping rcu_torture_reader task");
396 kthread_stop(reader_tasks[i]);
397 }
398 reader_tasks[i] = NULL;
399 }
400 kfree(reader_tasks);
401 reader_tasks = NULL;
402 }
403 rcu_torture_current = NULL;
404
405 if (stats_task != NULL) {
406 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
407 kthread_stop(stats_task);
408 }
409 stats_task = NULL;
410
411 /* Wait for all RCU callbacks to fire. */
412
413 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
414 synchronize_rcu();
415 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
416 printk(KERN_ALERT TORTURE_FLAG
417 "--- End of test: %s\n",
418 atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE");
419}
420
421static int
422rcu_torture_init(void)
423{
424 int i;
425 int cpu;
426 int firsterr = 0;
427
428 /* Process args and tell the world that the torturer is on the job. */
429
430 if (nreaders >= 0)
431 nrealreaders = nreaders;
432 else
433 nrealreaders = 2 * num_online_cpus();
434 printk(KERN_ALERT TORTURE_FLAG
435 "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n",
436 nrealreaders, stat_interval, verbose);
437 fullstop = 0;
438
439 /* Set up the freelist. */
440
441 INIT_LIST_HEAD(&rcu_torture_freelist);
442 for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
443 rcu_tortures[i].rtort_mbtest = 0;
444 list_add_tail(&rcu_tortures[i].rtort_free,
445 &rcu_torture_freelist);
446 }
447
448 /* Initialize the statistics so that each run gets its own numbers. */
449
450 rcu_torture_current = NULL;
451 rcu_torture_current_version = 0;
452 atomic_set(&n_rcu_torture_alloc, 0);
453 atomic_set(&n_rcu_torture_alloc_fail, 0);
454 atomic_set(&n_rcu_torture_free, 0);
455 atomic_set(&n_rcu_torture_mberror, 0);
456 atomic_set(&n_rcu_torture_error, 0);
457 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
458 atomic_set(&rcu_torture_wcount[i], 0);
459 for_each_cpu(cpu) {
460 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
461 per_cpu(rcu_torture_count, cpu)[i] = 0;
462 per_cpu(rcu_torture_batch, cpu)[i] = 0;
463 }
464 }
465
466 /* Start up the kthreads. */
467
468 VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
469 writer_task = kthread_run(rcu_torture_writer, NULL,
470 "rcu_torture_writer");
471 if (IS_ERR(writer_task)) {
472 firsterr = PTR_ERR(writer_task);
473 VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
474 writer_task = NULL;
475 goto unwind;
476 }
477 reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]),
478 GFP_KERNEL);
479 if (reader_tasks == NULL) {
480 VERBOSE_PRINTK_ERRSTRING("out of memory");
481 firsterr = -ENOMEM;
482 goto unwind;
483 }
484 for (i = 0; i < nrealreaders; i++) {
485 VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
486 reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
487 "rcu_torture_reader");
488 if (IS_ERR(reader_tasks[i])) {
489 firsterr = PTR_ERR(reader_tasks[i]);
490 VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
491 reader_tasks[i] = NULL;
492 goto unwind;
493 }
494 }
495 if (stat_interval > 0) {
496 VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
497 stats_task = kthread_run(rcu_torture_stats, NULL,
498 "rcu_torture_stats");
499 if (IS_ERR(stats_task)) {
500 firsterr = PTR_ERR(stats_task);
501 VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
502 stats_task = NULL;
503 goto unwind;
504 }
505 }
506 return 0;
507
508unwind:
509 rcu_torture_cleanup();
510 return firsterr;
511}
512
513module_init(rcu_torture_init);
514module_exit(rcu_torture_cleanup);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1f31a528fdba..6f46c94cc29e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,6 +206,7 @@ struct runqueue {
206 */ 206 */
207 unsigned long nr_running; 207 unsigned long nr_running;
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
209 unsigned long prio_bias;
209 unsigned long cpu_load[3]; 210 unsigned long cpu_load[3];
210#endif 211#endif
211 unsigned long long nr_switches; 212 unsigned long long nr_switches;
@@ -659,13 +660,68 @@ static int effective_prio(task_t *p)
659 return prio; 660 return prio;
660} 661}
661 662
663#ifdef CONFIG_SMP
664static inline void inc_prio_bias(runqueue_t *rq, int prio)
665{
666 rq->prio_bias += MAX_PRIO - prio;
667}
668
669static inline void dec_prio_bias(runqueue_t *rq, int prio)
670{
671 rq->prio_bias -= MAX_PRIO - prio;
672}
673
674static inline void inc_nr_running(task_t *p, runqueue_t *rq)
675{
676 rq->nr_running++;
677 if (rt_task(p)) {
678 if (p != rq->migration_thread)
679 /*
680 * The migration thread does the actual balancing. Do
681 * not bias by its priority as the ultra high priority
682 * will skew balancing adversely.
683 */
684 inc_prio_bias(rq, p->prio);
685 } else
686 inc_prio_bias(rq, p->static_prio);
687}
688
689static inline void dec_nr_running(task_t *p, runqueue_t *rq)
690{
691 rq->nr_running--;
692 if (rt_task(p)) {
693 if (p != rq->migration_thread)
694 dec_prio_bias(rq, p->prio);
695 } else
696 dec_prio_bias(rq, p->static_prio);
697}
698#else
699static inline void inc_prio_bias(runqueue_t *rq, int prio)
700{
701}
702
703static inline void dec_prio_bias(runqueue_t *rq, int prio)
704{
705}
706
707static inline void inc_nr_running(task_t *p, runqueue_t *rq)
708{
709 rq->nr_running++;
710}
711
712static inline void dec_nr_running(task_t *p, runqueue_t *rq)
713{
714 rq->nr_running--;
715}
716#endif
717
662/* 718/*
663 * __activate_task - move a task to the runqueue. 719 * __activate_task - move a task to the runqueue.
664 */ 720 */
665static inline void __activate_task(task_t *p, runqueue_t *rq) 721static inline void __activate_task(task_t *p, runqueue_t *rq)
666{ 722{
667 enqueue_task(p, rq->active); 723 enqueue_task(p, rq->active);
668 rq->nr_running++; 724 inc_nr_running(p, rq);
669} 725}
670 726
671/* 727/*
@@ -674,7 +730,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
674static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 730static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
675{ 731{
676 enqueue_task_head(p, rq->active); 732 enqueue_task_head(p, rq->active);
677 rq->nr_running++; 733 inc_nr_running(p, rq);
678} 734}
679 735
680static int recalc_task_prio(task_t *p, unsigned long long now) 736static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -759,7 +815,8 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
759 } 815 }
760#endif 816#endif
761 817
762 p->prio = recalc_task_prio(p, now); 818 if (!rt_task(p))
819 p->prio = recalc_task_prio(p, now);
763 820
764 /* 821 /*
765 * This checks to make sure it's not an uninterruptible task 822 * This checks to make sure it's not an uninterruptible task
@@ -793,7 +850,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
793 */ 850 */
794static void deactivate_task(struct task_struct *p, runqueue_t *rq) 851static void deactivate_task(struct task_struct *p, runqueue_t *rq)
795{ 852{
796 rq->nr_running--; 853 dec_nr_running(p, rq);
797 dequeue_task(p, p->array); 854 dequeue_task(p, p->array);
798 p->array = NULL; 855 p->array = NULL;
799} 856}
@@ -808,21 +865,28 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
808#ifdef CONFIG_SMP 865#ifdef CONFIG_SMP
809static void resched_task(task_t *p) 866static void resched_task(task_t *p)
810{ 867{
811 int need_resched, nrpolling; 868 int cpu;
812 869
813 assert_spin_locked(&task_rq(p)->lock); 870 assert_spin_locked(&task_rq(p)->lock);
814 871
815 /* minimise the chance of sending an interrupt to poll_idle() */ 872 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
816 nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 873 return;
817 need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); 874
818 nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); 875 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
876
877 cpu = task_cpu(p);
878 if (cpu == smp_processor_id())
879 return;
819 880
820 if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) 881 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
821 smp_send_reschedule(task_cpu(p)); 882 smp_mb();
883 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
884 smp_send_reschedule(cpu);
822} 885}
823#else 886#else
824static inline void resched_task(task_t *p) 887static inline void resched_task(task_t *p)
825{ 888{
889 assert_spin_locked(&task_rq(p)->lock);
826 set_tsk_need_resched(p); 890 set_tsk_need_resched(p);
827} 891}
828#endif 892#endif
@@ -930,27 +994,61 @@ void kick_process(task_t *p)
930 * We want to under-estimate the load of migration sources, to 994 * We want to under-estimate the load of migration sources, to
931 * balance conservatively. 995 * balance conservatively.
932 */ 996 */
933static inline unsigned long source_load(int cpu, int type) 997static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
934{ 998{
935 runqueue_t *rq = cpu_rq(cpu); 999 runqueue_t *rq = cpu_rq(cpu);
936 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1000 unsigned long running = rq->nr_running;
1001 unsigned long source_load, cpu_load = rq->cpu_load[type-1],
1002 load_now = running * SCHED_LOAD_SCALE;
1003
937 if (type == 0) 1004 if (type == 0)
938 return load_now; 1005 source_load = load_now;
1006 else
1007 source_load = min(cpu_load, load_now);
1008
1009 if (running > 1 || (idle == NOT_IDLE && running))
1010 /*
1011 * If we are busy rebalancing the load is biased by
1012 * priority to create 'nice' support across cpus. When
1013 * idle rebalancing we should only bias the source_load if
1014 * there is more than one task running on that queue to
1015 * prevent idle rebalance from trying to pull tasks from a
1016 * queue with only one running task.
1017 */
1018 source_load = source_load * rq->prio_bias / running;
1019
1020 return source_load;
1021}
939 1022
940 return min(rq->cpu_load[type-1], load_now); 1023static inline unsigned long source_load(int cpu, int type)
1024{
1025 return __source_load(cpu, type, NOT_IDLE);
941} 1026}
942 1027
943/* 1028/*
944 * Return a high guess at the load of a migration-target cpu 1029 * Return a high guess at the load of a migration-target cpu
945 */ 1030 */
946static inline unsigned long target_load(int cpu, int type) 1031static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
947{ 1032{
948 runqueue_t *rq = cpu_rq(cpu); 1033 runqueue_t *rq = cpu_rq(cpu);
949 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1034 unsigned long running = rq->nr_running;
1035 unsigned long target_load, cpu_load = rq->cpu_load[type-1],
1036 load_now = running * SCHED_LOAD_SCALE;
1037
950 if (type == 0) 1038 if (type == 0)
951 return load_now; 1039 target_load = load_now;
1040 else
1041 target_load = max(cpu_load, load_now);
1042
1043 if (running > 1 || (idle == NOT_IDLE && running))
1044 target_load = target_load * rq->prio_bias / running;
1045
1046 return target_load;
1047}
952 1048
953 return max(rq->cpu_load[type-1], load_now); 1049static inline unsigned long target_load(int cpu, int type)
1050{
1051 return __target_load(cpu, type, NOT_IDLE);
954} 1052}
955 1053
956/* 1054/*
@@ -1339,7 +1437,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1339#endif 1437#endif
1340#ifdef CONFIG_PREEMPT 1438#ifdef CONFIG_PREEMPT
1341 /* Want to start with kernel preemption disabled. */ 1439 /* Want to start with kernel preemption disabled. */
1342 p->thread_info->preempt_count = 1; 1440 task_thread_info(p)->preempt_count = 1;
1343#endif 1441#endif
1344 /* 1442 /*
1345 * Share the timeslice between parent and child, thus the 1443 * Share the timeslice between parent and child, thus the
@@ -1411,7 +1509,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1411 list_add_tail(&p->run_list, &current->run_list); 1509 list_add_tail(&p->run_list, &current->run_list);
1412 p->array = current->array; 1510 p->array = current->array;
1413 p->array->nr_active++; 1511 p->array->nr_active++;
1414 rq->nr_running++; 1512 inc_nr_running(p, rq);
1415 } 1513 }
1416 set_need_resched(); 1514 set_need_resched();
1417 } else 1515 } else
@@ -1468,7 +1566,7 @@ void fastcall sched_exit(task_t *p)
1468 * the sleep_avg of the parent as well. 1566 * the sleep_avg of the parent as well.
1469 */ 1567 */
1470 rq = task_rq_lock(p->parent, &flags); 1568 rq = task_rq_lock(p->parent, &flags);
1471 if (p->first_time_slice) { 1569 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1472 p->parent->time_slice += p->time_slice; 1570 p->parent->time_slice += p->time_slice;
1473 if (unlikely(p->parent->time_slice > task_timeslice(p))) 1571 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1474 p->parent->time_slice = task_timeslice(p); 1572 p->parent->time_slice = task_timeslice(p);
@@ -1756,9 +1854,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1756 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1854 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1757{ 1855{
1758 dequeue_task(p, src_array); 1856 dequeue_task(p, src_array);
1759 src_rq->nr_running--; 1857 dec_nr_running(p, src_rq);
1760 set_task_cpu(p, this_cpu); 1858 set_task_cpu(p, this_cpu);
1761 this_rq->nr_running++; 1859 inc_nr_running(p, this_rq);
1762 enqueue_task(p, this_array); 1860 enqueue_task(p, this_array);
1763 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1861 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1764 + this_rq->timestamp_last_tick; 1862 + this_rq->timestamp_last_tick;
@@ -1937,9 +2035,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1937 2035
1938 /* Bias balancing toward cpus of our domain */ 2036 /* Bias balancing toward cpus of our domain */
1939 if (local_group) 2037 if (local_group)
1940 load = target_load(i, load_idx); 2038 load = __target_load(i, load_idx, idle);
1941 else 2039 else
1942 load = source_load(i, load_idx); 2040 load = __source_load(i, load_idx, idle);
1943 2041
1944 avg_load += load; 2042 avg_load += load;
1945 } 2043 }
@@ -2044,14 +2142,15 @@ out_balanced:
2044/* 2142/*
2045 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2143 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2046 */ 2144 */
2047static runqueue_t *find_busiest_queue(struct sched_group *group) 2145static runqueue_t *find_busiest_queue(struct sched_group *group,
2146 enum idle_type idle)
2048{ 2147{
2049 unsigned long load, max_load = 0; 2148 unsigned long load, max_load = 0;
2050 runqueue_t *busiest = NULL; 2149 runqueue_t *busiest = NULL;
2051 int i; 2150 int i;
2052 2151
2053 for_each_cpu_mask(i, group->cpumask) { 2152 for_each_cpu_mask(i, group->cpumask) {
2054 load = source_load(i, 0); 2153 load = __source_load(i, 0, idle);
2055 2154
2056 if (load > max_load) { 2155 if (load > max_load) {
2057 max_load = load; 2156 max_load = load;
@@ -2095,7 +2194,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2095 goto out_balanced; 2194 goto out_balanced;
2096 } 2195 }
2097 2196
2098 busiest = find_busiest_queue(group); 2197 busiest = find_busiest_queue(group, idle);
2099 if (!busiest) { 2198 if (!busiest) {
2100 schedstat_inc(sd, lb_nobusyq[idle]); 2199 schedstat_inc(sd, lb_nobusyq[idle]);
2101 goto out_balanced; 2200 goto out_balanced;
@@ -2218,7 +2317,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2218 goto out_balanced; 2317 goto out_balanced;
2219 } 2318 }
2220 2319
2221 busiest = find_busiest_queue(group); 2320 busiest = find_busiest_queue(group, NEWLY_IDLE);
2222 if (!busiest) { 2321 if (!busiest) {
2223 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2322 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2224 goto out_balanced; 2323 goto out_balanced;
@@ -2511,8 +2610,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
2511 cpustat->idle = cputime64_add(cpustat->idle, tmp); 2610 cpustat->idle = cputime64_add(cpustat->idle, tmp);
2512 /* Account for system time used */ 2611 /* Account for system time used */
2513 acct_update_integrals(p); 2612 acct_update_integrals(p);
2514 /* Update rss highwater mark */
2515 update_mem_hiwater(p);
2516} 2613}
2517 2614
2518/* 2615/*
@@ -3453,8 +3550,10 @@ void set_user_nice(task_t *p, long nice)
3453 goto out_unlock; 3550 goto out_unlock;
3454 } 3551 }
3455 array = p->array; 3552 array = p->array;
3456 if (array) 3553 if (array) {
3457 dequeue_task(p, array); 3554 dequeue_task(p, array);
3555 dec_prio_bias(rq, p->static_prio);
3556 }
3458 3557
3459 old_prio = p->prio; 3558 old_prio = p->prio;
3460 new_prio = NICE_TO_PRIO(nice); 3559 new_prio = NICE_TO_PRIO(nice);
@@ -3464,6 +3563,7 @@ void set_user_nice(task_t *p, long nice)
3464 3563
3465 if (array) { 3564 if (array) {
3466 enqueue_task(p, array); 3565 enqueue_task(p, array);
3566 inc_prio_bias(rq, p->static_prio);
3467 /* 3567 /*
3468 * If the task increased its priority or is running and 3568 * If the task increased its priority or is running and
3469 * lowered its priority, then reschedule its CPU: 3569 * lowered its priority, then reschedule its CPU:
@@ -3565,8 +3665,6 @@ int idle_cpu(int cpu)
3565 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 3665 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3566} 3666}
3567 3667
3568EXPORT_SYMBOL_GPL(idle_cpu);
3569
3570/** 3668/**
3571 * idle_task - return the idle task for a given cpu. 3669 * idle_task - return the idle task for a given cpu.
3572 * @cpu: the processor in question. 3670 * @cpu: the processor in question.
@@ -4229,10 +4327,10 @@ static void show_task(task_t *p)
4229#endif 4327#endif
4230#ifdef CONFIG_DEBUG_STACK_USAGE 4328#ifdef CONFIG_DEBUG_STACK_USAGE
4231 { 4329 {
4232 unsigned long *n = (unsigned long *) (p->thread_info+1); 4330 unsigned long *n = end_of_stack(p);
4233 while (!*n) 4331 while (!*n)
4234 n++; 4332 n++;
4235 free = (unsigned long) n - (unsigned long)(p->thread_info+1); 4333 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4236 } 4334 }
4237#endif 4335#endif
4238 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); 4336 printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
@@ -4312,9 +4410,9 @@ void __devinit init_idle(task_t *idle, int cpu)
4312 4410
4313 /* Set the preempt count _outside_ the spinlocks! */ 4411 /* Set the preempt count _outside_ the spinlocks! */
4314#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4412#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4315 idle->thread_info->preempt_count = (idle->lock_depth >= 0); 4413 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4316#else 4414#else
4317 idle->thread_info->preempt_count = 0; 4415 task_thread_info(idle)->preempt_count = 0;
4318#endif 4416#endif
4319} 4417}
4320 4418
@@ -4682,7 +4780,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4682#ifdef CONFIG_HOTPLUG_CPU 4780#ifdef CONFIG_HOTPLUG_CPU
4683 case CPU_UP_CANCELED: 4781 case CPU_UP_CANCELED:
4684 /* Unbind it from offline cpu so it can run. Fall thru. */ 4782 /* Unbind it from offline cpu so it can run. Fall thru. */
4685 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); 4783 kthread_bind(cpu_rq(cpu)->migration_thread,
4784 any_online_cpu(cpu_online_map));
4686 kthread_stop(cpu_rq(cpu)->migration_thread); 4785 kthread_stop(cpu_rq(cpu)->migration_thread);
4687 cpu_rq(cpu)->migration_thread = NULL; 4786 cpu_rq(cpu)->migration_thread = NULL;
4688 break; 4787 break;
diff --git a/kernel/signal.c b/kernel/signal.c
index b92c3c9f8b9a..d7611f189ef7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask)
262 return sig; 262 return sig;
263} 263}
264 264
265static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags, 265static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
266 int override_rlimit) 266 int override_rlimit)
267{ 267{
268 struct sigqueue *q = NULL; 268 struct sigqueue *q = NULL;
@@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __n
277 } else { 277 } else {
278 INIT_LIST_HEAD(&q->list); 278 INIT_LIST_HEAD(&q->list);
279 q->flags = 0; 279 q->flags = 0;
280 q->lock = NULL;
281 q->user = get_uid(t->user); 280 q->user = get_uid(t->user);
282 } 281 }
283 return(q); 282 return(q);
@@ -397,20 +396,8 @@ void __exit_signal(struct task_struct *tsk)
397 flush_sigqueue(&tsk->pending); 396 flush_sigqueue(&tsk->pending);
398 if (sig) { 397 if (sig) {
399 /* 398 /*
400 * We are cleaning up the signal_struct here. We delayed 399 * We are cleaning up the signal_struct here.
401 * calling exit_itimers until after flush_sigqueue, just in
402 * case our thread-local pending queue contained a queued
403 * timer signal that would have been cleared in
404 * exit_itimers. When that called sigqueue_free, it would
405 * attempt to re-take the tasklist_lock and deadlock. This
406 * can never happen if we ensure that all queues the
407 * timer's signal might be queued on have been flushed
408 * first. The shared_pending queue, and our own pending
409 * queue are the only queues the timer could be on, since
410 * there are no other threads left in the group and timer
411 * signals are constrained to threads inside the group.
412 */ 400 */
413 exit_itimers(sig);
414 exit_thread_group_keys(sig); 401 exit_thread_group_keys(sig);
415 kmem_cache_free(signal_cachep, sig); 402 kmem_cache_free(signal_cachep, sig);
416 } 403 }
@@ -418,6 +405,8 @@ void __exit_signal(struct task_struct *tsk)
418 405
419void exit_signal(struct task_struct *tsk) 406void exit_signal(struct task_struct *tsk)
420{ 407{
408 atomic_dec(&tsk->signal->live);
409
421 write_lock_irq(&tasklist_lock); 410 write_lock_irq(&tasklist_lock);
422 __exit_signal(tsk); 411 __exit_signal(tsk);
423 write_unlock_irq(&tasklist_lock); 412 write_unlock_irq(&tasklist_lock);
@@ -524,16 +513,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
524{ 513{
525 int sig = 0; 514 int sig = 0;
526 515
527 /* SIGKILL must have priority, otherwise it is quite easy 516 sig = next_signal(pending, mask);
528 * to create an unkillable process, sending sig < SIGKILL
529 * to self */
530 if (unlikely(sigismember(&pending->signal, SIGKILL))) {
531 if (!sigismember(mask, SIGKILL))
532 sig = SIGKILL;
533 }
534
535 if (likely(!sig))
536 sig = next_signal(pending, mask);
537 if (sig) { 517 if (sig) {
538 if (current->notifier) { 518 if (current->notifier) {
539 if (sigismember(current->notifier_mask, sig)) { 519 if (sigismember(current->notifier_mask, sig)) {
@@ -578,7 +558,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
578 * is to alert stop-signal processing code when another 558 * is to alert stop-signal processing code when another
579 * processor has come along and cleared the flag. 559 * processor has come along and cleared the flag.
580 */ 560 */
581 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; 561 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
562 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
582 } 563 }
583 if ( signr && 564 if ( signr &&
584 ((info->si_code & __SI_MASK) == __SI_TIMER) && 565 ((info->si_code & __SI_MASK) == __SI_TIMER) &&
@@ -661,8 +642,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
661 if (!valid_signal(sig)) 642 if (!valid_signal(sig))
662 return error; 643 return error;
663 error = -EPERM; 644 error = -EPERM;
664 if ((!info || ((unsigned long)info != 1 && 645 if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
665 (unsigned long)info != 2 && SI_FROMUSER(info)))
666 && ((sig != SIGCONT) || 646 && ((sig != SIGCONT) ||
667 (current->signal->session != t->signal->session)) 647 (current->signal->session != t->signal->session))
668 && (current->euid ^ t->suid) && (current->euid ^ t->uid) 648 && (current->euid ^ t->suid) && (current->euid ^ t->uid)
@@ -799,7 +779,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
799 * fast-pathed signals for kernel-internal things like SIGSTOP 779 * fast-pathed signals for kernel-internal things like SIGSTOP
800 * or SIGKILL. 780 * or SIGKILL.
801 */ 781 */
802 if ((unsigned long)info == 2) 782 if (info == SEND_SIG_FORCED)
803 goto out_set; 783 goto out_set;
804 784
805 /* Real-time signals must be queued if sent by sigqueue, or 785 /* Real-time signals must be queued if sent by sigqueue, or
@@ -811,19 +791,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
811 pass on the info struct. */ 791 pass on the info struct. */
812 792
813 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && 793 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
814 ((unsigned long) info < 2 || 794 (is_si_special(info) ||
815 info->si_code >= 0))); 795 info->si_code >= 0)));
816 if (q) { 796 if (q) {
817 list_add_tail(&q->list, &signals->list); 797 list_add_tail(&q->list, &signals->list);
818 switch ((unsigned long) info) { 798 switch ((unsigned long) info) {
819 case 0: 799 case (unsigned long) SEND_SIG_NOINFO:
820 q->info.si_signo = sig; 800 q->info.si_signo = sig;
821 q->info.si_errno = 0; 801 q->info.si_errno = 0;
822 q->info.si_code = SI_USER; 802 q->info.si_code = SI_USER;
823 q->info.si_pid = current->pid; 803 q->info.si_pid = current->pid;
824 q->info.si_uid = current->uid; 804 q->info.si_uid = current->uid;
825 break; 805 break;
826 case 1: 806 case (unsigned long) SEND_SIG_PRIV:
827 q->info.si_signo = sig; 807 q->info.si_signo = sig;
828 q->info.si_errno = 0; 808 q->info.si_errno = 0;
829 q->info.si_code = SI_KERNEL; 809 q->info.si_code = SI_KERNEL;
@@ -834,20 +814,13 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
834 copy_siginfo(&q->info, info); 814 copy_siginfo(&q->info, info);
835 break; 815 break;
836 } 816 }
837 } else { 817 } else if (!is_si_special(info)) {
838 if (sig >= SIGRTMIN && info && (unsigned long)info != 1 818 if (sig >= SIGRTMIN && info->si_code != SI_USER)
839 && info->si_code != SI_USER)
840 /* 819 /*
841 * Queue overflow, abort. We may abort if the signal was rt 820 * Queue overflow, abort. We may abort if the signal was rt
842 * and sent by user using something other than kill(). 821 * and sent by user using something other than kill().
843 */ 822 */
844 return -EAGAIN; 823 return -EAGAIN;
845 if (((unsigned long)info > 1) && (info->si_code == SI_TIMER))
846 /*
847 * Set up a return to indicate that we dropped
848 * the signal.
849 */
850 ret = info->si_sys_private;
851 } 824 }
852 825
853out_set: 826out_set:
@@ -868,12 +841,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
868 BUG(); 841 BUG();
869 assert_spin_locked(&t->sighand->siglock); 842 assert_spin_locked(&t->sighand->siglock);
870 843
871 if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
872 /*
873 * Set up a return to indicate that we dropped the signal.
874 */
875 ret = info->si_sys_private;
876
877 /* Short-circuit ignored signals. */ 844 /* Short-circuit ignored signals. */
878 if (sig_ignored(t, sig)) 845 if (sig_ignored(t, sig))
879 goto out; 846 goto out;
@@ -903,11 +870,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
903 int ret; 870 int ret;
904 871
905 spin_lock_irqsave(&t->sighand->siglock, flags); 872 spin_lock_irqsave(&t->sighand->siglock, flags);
906 if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { 873 if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
907 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; 874 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
875 }
876 if (sigismember(&t->blocked, sig)) {
908 sigdelset(&t->blocked, sig); 877 sigdelset(&t->blocked, sig);
909 recalc_sigpending_tsk(t);
910 } 878 }
879 recalc_sigpending_tsk(t);
911 ret = specific_send_sig_info(sig, info, t); 880 ret = specific_send_sig_info(sig, info, t);
912 spin_unlock_irqrestore(&t->sighand->siglock, flags); 881 spin_unlock_irqrestore(&t->sighand->siglock, flags);
913 882
@@ -917,15 +886,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
917void 886void
918force_sig_specific(int sig, struct task_struct *t) 887force_sig_specific(int sig, struct task_struct *t)
919{ 888{
920 unsigned long int flags; 889 force_sig_info(sig, SEND_SIG_FORCED, t);
921
922 spin_lock_irqsave(&t->sighand->siglock, flags);
923 if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN)
924 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
925 sigdelset(&t->blocked, sig);
926 recalc_sigpending_tsk(t);
927 specific_send_sig_info(sig, (void *)2, t);
928 spin_unlock_irqrestore(&t->sighand->siglock, flags);
929} 890}
930 891
931/* 892/*
@@ -936,34 +897,31 @@ force_sig_specific(int sig, struct task_struct *t)
936 * as soon as they're available, so putting the signal on the shared queue 897 * as soon as they're available, so putting the signal on the shared queue
937 * will be equivalent to sending it to one such thread. 898 * will be equivalent to sending it to one such thread.
938 */ 899 */
939#define wants_signal(sig, p, mask) \ 900static inline int wants_signal(int sig, struct task_struct *p)
940 (!sigismember(&(p)->blocked, sig) \ 901{
941 && !((p)->state & mask) \ 902 if (sigismember(&p->blocked, sig))
942 && !((p)->flags & PF_EXITING) \ 903 return 0;
943 && (task_curr(p) || !signal_pending(p))) 904 if (p->flags & PF_EXITING)
944 905 return 0;
906 if (sig == SIGKILL)
907 return 1;
908 if (p->state & (TASK_STOPPED | TASK_TRACED))
909 return 0;
910 return task_curr(p) || !signal_pending(p);
911}
945 912
946static void 913static void
947__group_complete_signal(int sig, struct task_struct *p) 914__group_complete_signal(int sig, struct task_struct *p)
948{ 915{
949 unsigned int mask;
950 struct task_struct *t; 916 struct task_struct *t;
951 917
952 /* 918 /*
953 * Don't bother traced and stopped tasks (but
954 * SIGKILL will punch through that).
955 */
956 mask = TASK_STOPPED | TASK_TRACED;
957 if (sig == SIGKILL)
958 mask = 0;
959
960 /*
961 * Now find a thread we can wake up to take the signal off the queue. 919 * Now find a thread we can wake up to take the signal off the queue.
962 * 920 *
963 * If the main thread wants the signal, it gets first crack. 921 * If the main thread wants the signal, it gets first crack.
964 * Probably the least surprising to the average bear. 922 * Probably the least surprising to the average bear.
965 */ 923 */
966 if (wants_signal(sig, p, mask)) 924 if (wants_signal(sig, p))
967 t = p; 925 t = p;
968 else if (thread_group_empty(p)) 926 else if (thread_group_empty(p))
969 /* 927 /*
@@ -981,7 +939,7 @@ __group_complete_signal(int sig, struct task_struct *p)
981 t = p->signal->curr_target = p; 939 t = p->signal->curr_target = p;
982 BUG_ON(t->tgid != p->tgid); 940 BUG_ON(t->tgid != p->tgid);
983 941
984 while (!wants_signal(sig, t, mask)) { 942 while (!wants_signal(sig, t)) {
985 t = next_thread(t); 943 t = next_thread(t);
986 if (t == p->signal->curr_target) 944 if (t == p->signal->curr_target)
987 /* 945 /*
@@ -1063,12 +1021,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1063 assert_spin_locked(&p->sighand->siglock); 1021 assert_spin_locked(&p->sighand->siglock);
1064 handle_stop_signal(sig, p); 1022 handle_stop_signal(sig, p);
1065 1023
1066 if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
1067 /*
1068 * Set up a return to indicate that we dropped the signal.
1069 */
1070 ret = info->si_sys_private;
1071
1072 /* Short-circuit ignored signals. */ 1024 /* Short-circuit ignored signals. */
1073 if (sig_ignored(p, sig)) 1025 if (sig_ignored(p, sig))
1074 return ret; 1026 return ret;
@@ -1121,8 +1073,8 @@ void zap_other_threads(struct task_struct *p)
1121 if (t != p->group_leader) 1073 if (t != p->group_leader)
1122 t->exit_signal = -1; 1074 t->exit_signal = -1;
1123 1075
1076 /* SIGKILL will be handled before any pending SIGSTOP */
1124 sigaddset(&t->pending.signal, SIGKILL); 1077 sigaddset(&t->pending.signal, SIGKILL);
1125 rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
1126 signal_wake_up(t, 1); 1078 signal_wake_up(t, 1);
1127 } 1079 }
1128} 1080}
@@ -1195,6 +1147,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1195 return error; 1147 return error;
1196} 1148}
1197 1149
1150/* like kill_proc_info(), but doesn't use uid/euid of "current" */
1151int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1152 uid_t uid, uid_t euid)
1153{
1154 int ret = -EINVAL;
1155 struct task_struct *p;
1156
1157 if (!valid_signal(sig))
1158 return ret;
1159
1160 read_lock(&tasklist_lock);
1161 p = find_task_by_pid(pid);
1162 if (!p) {
1163 ret = -ESRCH;
1164 goto out_unlock;
1165 }
1166 if ((!info || ((unsigned long)info != 1 &&
1167 (unsigned long)info != 2 && SI_FROMUSER(info)))
1168 && (euid != p->suid) && (euid != p->uid)
1169 && (uid != p->suid) && (uid != p->uid)) {
1170 ret = -EPERM;
1171 goto out_unlock;
1172 }
1173 if (sig && p->sighand) {
1174 unsigned long flags;
1175 spin_lock_irqsave(&p->sighand->siglock, flags);
1176 ret = __group_send_sig_info(sig, info, p);
1177 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1178 }
1179out_unlock:
1180 read_unlock(&tasklist_lock);
1181 return ret;
1182}
1183EXPORT_SYMBOL_GPL(kill_proc_info_as_uid);
1198 1184
1199/* 1185/*
1200 * kill_something_info() interprets pid in interesting ways just like kill(2). 1186 * kill_something_info() interprets pid in interesting ways just like kill(2).
@@ -1264,10 +1250,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1264 return ret; 1250 return ret;
1265} 1251}
1266 1252
1253#define __si_special(priv) \
1254 ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
1255
1267int 1256int
1268send_sig(int sig, struct task_struct *p, int priv) 1257send_sig(int sig, struct task_struct *p, int priv)
1269{ 1258{
1270 return send_sig_info(sig, (void*)(long)(priv != 0), p); 1259 return send_sig_info(sig, __si_special(priv), p);
1271} 1260}
1272 1261
1273/* 1262/*
@@ -1287,7 +1276,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1287void 1276void
1288force_sig(int sig, struct task_struct *p) 1277force_sig(int sig, struct task_struct *p)
1289{ 1278{
1290 force_sig_info(sig, (void*)1L, p); 1279 force_sig_info(sig, SEND_SIG_PRIV, p);
1291} 1280}
1292 1281
1293/* 1282/*
@@ -1312,13 +1301,13 @@ force_sigsegv(int sig, struct task_struct *p)
1312int 1301int
1313kill_pg(pid_t pgrp, int sig, int priv) 1302kill_pg(pid_t pgrp, int sig, int priv)
1314{ 1303{
1315 return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp); 1304 return kill_pg_info(sig, __si_special(priv), pgrp);
1316} 1305}
1317 1306
1318int 1307int
1319kill_proc(pid_t pid, int sig, int priv) 1308kill_proc(pid_t pid, int sig, int priv)
1320{ 1309{
1321 return kill_proc_info(sig, (void *)(long)(priv != 0), pid); 1310 return kill_proc_info(sig, __si_special(priv), pid);
1322} 1311}
1323 1312
1324/* 1313/*
@@ -1349,11 +1338,12 @@ void sigqueue_free(struct sigqueue *q)
1349 * pending queue. 1338 * pending queue.
1350 */ 1339 */
1351 if (unlikely(!list_empty(&q->list))) { 1340 if (unlikely(!list_empty(&q->list))) {
1352 read_lock(&tasklist_lock); 1341 spinlock_t *lock = &current->sighand->siglock;
1353 spin_lock_irqsave(q->lock, flags); 1342 read_lock(&tasklist_lock);
1343 spin_lock_irqsave(lock, flags);
1354 if (!list_empty(&q->list)) 1344 if (!list_empty(&q->list))
1355 list_del_init(&q->list); 1345 list_del_init(&q->list);
1356 spin_unlock_irqrestore(q->lock, flags); 1346 spin_unlock_irqrestore(lock, flags);
1357 read_unlock(&tasklist_lock); 1347 read_unlock(&tasklist_lock);
1358 } 1348 }
1359 q->flags &= ~SIGQUEUE_PREALLOC; 1349 q->flags &= ~SIGQUEUE_PREALLOC;
@@ -1392,7 +1382,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1392 goto out; 1382 goto out;
1393 } 1383 }
1394 1384
1395 q->lock = &p->sighand->siglock;
1396 list_add_tail(&q->list, &p->pending.list); 1385 list_add_tail(&q->list, &p->pending.list);
1397 sigaddset(&p->pending.signal, sig); 1386 sigaddset(&p->pending.signal, sig);
1398 if (!sigismember(&p->blocked, sig)) 1387 if (!sigismember(&p->blocked, sig))
@@ -1440,7 +1429,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1440 * We always use the shared queue for process-wide signals, 1429 * We always use the shared queue for process-wide signals,
1441 * to avoid several races. 1430 * to avoid several races.
1442 */ 1431 */
1443 q->lock = &p->sighand->siglock;
1444 list_add_tail(&q->list, &p->signal->shared_pending.list); 1432 list_add_tail(&q->list, &p->signal->shared_pending.list);
1445 sigaddset(&p->signal->shared_pending.signal, sig); 1433 sigaddset(&p->signal->shared_pending.signal, sig);
1446 1434
@@ -1502,7 +1490,7 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1502 1490
1503 psig = tsk->parent->sighand; 1491 psig = tsk->parent->sighand;
1504 spin_lock_irqsave(&psig->siglock, flags); 1492 spin_lock_irqsave(&psig->siglock, flags);
1505 if (sig == SIGCHLD && 1493 if (!tsk->ptrace && sig == SIGCHLD &&
1506 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN || 1494 (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
1507 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) { 1495 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
1508 /* 1496 /*
@@ -1766,7 +1754,8 @@ do_signal_stop(int signr)
1766 * stop is always done with the siglock held, 1754 * stop is always done with the siglock held,
1767 * so this check has no races. 1755 * so this check has no races.
1768 */ 1756 */
1769 if (t->state < TASK_STOPPED) { 1757 if (!t->exit_state &&
1758 !(t->state & (TASK_STOPPED|TASK_TRACED))) {
1770 stop_count++; 1759 stop_count++;
1771 signal_wake_up(t, 0); 1760 signal_wake_up(t, 0);
1772 } 1761 }
@@ -1858,9 +1847,9 @@ relock:
1858 /* Let the debugger run. */ 1847 /* Let the debugger run. */
1859 ptrace_stop(signr, signr, info); 1848 ptrace_stop(signr, signr, info);
1860 1849
1861 /* We're back. Did the debugger cancel the sig? */ 1850 /* We're back. Did the debugger cancel the sig or group_exit? */
1862 signr = current->exit_code; 1851 signr = current->exit_code;
1863 if (signr == 0) 1852 if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT)
1864 continue; 1853 continue;
1865 1854
1866 current->exit_code = 0; 1855 current->exit_code = 0;
@@ -2262,26 +2251,13 @@ sys_kill(int pid, int sig)
2262 return kill_something_info(sig, &info, pid); 2251 return kill_something_info(sig, &info, pid);
2263} 2252}
2264 2253
2265/** 2254static int do_tkill(int tgid, int pid, int sig)
2266 * sys_tgkill - send signal to one specific thread
2267 * @tgid: the thread group ID of the thread
2268 * @pid: the PID of the thread
2269 * @sig: signal to be sent
2270 *
2271 * This syscall also checks the tgid and returns -ESRCH even if the PID
2272 * exists but it's not belonging to the target process anymore. This
2273 * method solves the problem of threads exiting and PIDs getting reused.
2274 */
2275asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2276{ 2255{
2277 struct siginfo info;
2278 int error; 2256 int error;
2257 struct siginfo info;
2279 struct task_struct *p; 2258 struct task_struct *p;
2280 2259
2281 /* This is only valid for single tasks */ 2260 error = -ESRCH;
2282 if (pid <= 0 || tgid <= 0)
2283 return -EINVAL;
2284
2285 info.si_signo = sig; 2261 info.si_signo = sig;
2286 info.si_errno = 0; 2262 info.si_errno = 0;
2287 info.si_code = SI_TKILL; 2263 info.si_code = SI_TKILL;
@@ -2290,8 +2266,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2290 2266
2291 read_lock(&tasklist_lock); 2267 read_lock(&tasklist_lock);
2292 p = find_task_by_pid(pid); 2268 p = find_task_by_pid(pid);
2293 error = -ESRCH; 2269 if (p && (tgid <= 0 || p->tgid == tgid)) {
2294 if (p && (p->tgid == tgid)) {
2295 error = check_kill_permission(sig, &info, p); 2270 error = check_kill_permission(sig, &info, p);
2296 /* 2271 /*
2297 * The null signal is a permissions and process existence 2272 * The null signal is a permissions and process existence
@@ -2305,47 +2280,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2305 } 2280 }
2306 } 2281 }
2307 read_unlock(&tasklist_lock); 2282 read_unlock(&tasklist_lock);
2283
2308 return error; 2284 return error;
2309} 2285}
2310 2286
2287/**
2288 * sys_tgkill - send signal to one specific thread
2289 * @tgid: the thread group ID of the thread
2290 * @pid: the PID of the thread
2291 * @sig: signal to be sent
2292 *
2293 * This syscall also checks the tgid and returns -ESRCH even if the PID
2294 * exists but it's not belonging to the target process anymore. This
2295 * method solves the problem of threads exiting and PIDs getting reused.
2296 */
2297asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2298{
2299 /* This is only valid for single tasks */
2300 if (pid <= 0 || tgid <= 0)
2301 return -EINVAL;
2302
2303 return do_tkill(tgid, pid, sig);
2304}
2305
2311/* 2306/*
2312 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2307 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2313 */ 2308 */
2314asmlinkage long 2309asmlinkage long
2315sys_tkill(int pid, int sig) 2310sys_tkill(int pid, int sig)
2316{ 2311{
2317 struct siginfo info;
2318 int error;
2319 struct task_struct *p;
2320
2321 /* This is only valid for single tasks */ 2312 /* This is only valid for single tasks */
2322 if (pid <= 0) 2313 if (pid <= 0)
2323 return -EINVAL; 2314 return -EINVAL;
2324 2315
2325 info.si_signo = sig; 2316 return do_tkill(0, pid, sig);
2326 info.si_errno = 0;
2327 info.si_code = SI_TKILL;
2328 info.si_pid = current->tgid;
2329 info.si_uid = current->uid;
2330
2331 read_lock(&tasklist_lock);
2332 p = find_task_by_pid(pid);
2333 error = -ESRCH;
2334 if (p) {
2335 error = check_kill_permission(sig, &info, p);
2336 /*
2337 * The null signal is a permissions and process existence
2338 * probe. No signal is actually delivered.
2339 */
2340 if (!error && sig && p->sighand) {
2341 spin_lock_irq(&p->sighand->siglock);
2342 handle_stop_signal(sig, p);
2343 error = specific_send_sig_info(sig, &info, p);
2344 spin_unlock_irq(&p->sighand->siglock);
2345 }
2346 }
2347 read_unlock(&tasklist_lock);
2348 return error;
2349} 2317}
2350 2318
2351asmlinkage long 2319asmlinkage long
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f766b2fc48be..ad3295cdded5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -470,7 +470,8 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
470#ifdef CONFIG_HOTPLUG_CPU 470#ifdef CONFIG_HOTPLUG_CPU
471 case CPU_UP_CANCELED: 471 case CPU_UP_CANCELED:
472 /* Unbind so it can run. Fall thru. */ 472 /* Unbind so it can run. Fall thru. */
473 kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id()); 473 kthread_bind(per_cpu(ksoftirqd, hotcpu),
474 any_online_cpu(cpu_online_map));
474 case CPU_DEAD: 475 case CPU_DEAD:
475 p = per_cpu(ksoftirqd, hotcpu); 476 p = per_cpu(ksoftirqd, hotcpu);
476 per_cpu(ksoftirqd, hotcpu) = NULL; 477 per_cpu(ksoftirqd, hotcpu) = NULL;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 75976209cea7..c67189a25d52 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -73,9 +73,6 @@ void softlockup_tick(struct pt_regs *regs)
73static int watchdog(void * __bind_cpu) 73static int watchdog(void * __bind_cpu)
74{ 74{
75 struct sched_param param = { .sched_priority = 99 }; 75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79 76
80 sched_setscheduler(current, SCHED_FIFO, &param); 77 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE; 78 current->flags |= PF_NOFREEZE;
@@ -123,7 +120,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
123#ifdef CONFIG_HOTPLUG_CPU 120#ifdef CONFIG_HOTPLUG_CPU
124 case CPU_UP_CANCELED: 121 case CPU_UP_CANCELED:
125 /* Unbind so it can run. Fall thru. */ 122 /* Unbind so it can run. Fall thru. */
126 kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); 123 kthread_bind(per_cpu(watchdog_task, hotcpu),
124 any_online_cpu(cpu_online_map));
127 case CPU_DEAD: 125 case CPU_DEAD:
128 p = per_cpu(watchdog_task, hotcpu); 126 p = per_cpu(watchdog_task, hotcpu);
129 per_cpu(watchdog_task, hotcpu) = NULL; 127 per_cpu(watchdog_task, hotcpu) = NULL;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84a9d18aa8da..b3d4dc858e35 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -119,13 +119,12 @@ static int stop_machine(void)
119 return ret; 119 return ret;
120 } 120 }
121 121
122 /* Don't schedule us away at this point, please. */
123 local_irq_disable();
124
125 /* Now they are all started, make them hold the CPUs, ready. */ 122 /* Now they are all started, make them hold the CPUs, ready. */
123 preempt_disable();
126 stopmachine_set_state(STOPMACHINE_PREPARE); 124 stopmachine_set_state(STOPMACHINE_PREPARE);
127 125
128 /* Make them disable irqs. */ 126 /* Make them disable irqs. */
127 local_irq_disable();
129 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); 128 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
130 129
131 return 0; 130 return 0;
@@ -135,6 +134,7 @@ static void restart_machine(void)
135{ 134{
136 stopmachine_set_state(STOPMACHINE_EXIT); 135 stopmachine_set_state(STOPMACHINE_EXIT);
137 local_irq_enable(); 136 local_irq_enable();
137 preempt_enable_no_resched();
138} 138}
139 139
140struct stop_machine_data 140struct stop_machine_data
diff --git a/kernel/sys.c b/kernel/sys.c
index f723522e6986..bce933ebb29f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
28#include <linux/suspend.h> 28#include <linux/suspend.h>
29#include <linux/tty.h> 29#include <linux/tty.h>
30#include <linux/signal.h> 30#include <linux/signal.h>
31#include <linux/cn_proc.h>
31 32
32#include <linux/compat.h> 33#include <linux/compat.h>
33#include <linux/syscalls.h> 34#include <linux/syscalls.h>
@@ -361,17 +362,38 @@ out_unlock:
361 return retval; 362 return retval;
362} 363}
363 364
365/**
366 * emergency_restart - reboot the system
367 *
368 * Without shutting down any hardware or taking any locks
369 * reboot the system. This is called when we know we are in
370 * trouble so this is our best effort to reboot. This is
371 * safe to call in interrupt context.
372 */
364void emergency_restart(void) 373void emergency_restart(void)
365{ 374{
366 machine_emergency_restart(); 375 machine_emergency_restart();
367} 376}
368EXPORT_SYMBOL_GPL(emergency_restart); 377EXPORT_SYMBOL_GPL(emergency_restart);
369 378
370void kernel_restart(char *cmd) 379void kernel_restart_prepare(char *cmd)
371{ 380{
372 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 381 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
373 system_state = SYSTEM_RESTART; 382 system_state = SYSTEM_RESTART;
374 device_shutdown(); 383 device_shutdown();
384}
385
386/**
387 * kernel_restart - reboot the system
388 * @cmd: pointer to buffer containing command to execute for restart
389 * or %NULL
390 *
391 * Shutdown everything and perform a clean reboot.
392 * This is not safe to call in interrupt context.
393 */
394void kernel_restart(char *cmd)
395{
396 kernel_restart_prepare(cmd);
375 if (!cmd) { 397 if (!cmd) {
376 printk(KERN_EMERG "Restarting system.\n"); 398 printk(KERN_EMERG "Restarting system.\n");
377 } else { 399 } else {
@@ -382,6 +404,12 @@ void kernel_restart(char *cmd)
382} 404}
383EXPORT_SYMBOL_GPL(kernel_restart); 405EXPORT_SYMBOL_GPL(kernel_restart);
384 406
407/**
408 * kernel_kexec - reboot the system
409 *
410 * Move into place and start executing a preloaded standalone
411 * executable. If nothing was preloaded, return without doing anything.
412 */
385void kernel_kexec(void) 413void kernel_kexec(void)
386{ 414{
387#ifdef CONFIG_KEXEC 415#ifdef CONFIG_KEXEC
@@ -390,9 +418,7 @@ void kernel_kexec(void)
390 if (!image) { 418 if (!image) {
391 return; 419 return;
392 } 420 }
393 notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); 421 kernel_restart_prepare(NULL);
394 system_state = SYSTEM_RESTART;
395 device_shutdown();
396 printk(KERN_EMERG "Starting new kernel\n"); 422 printk(KERN_EMERG "Starting new kernel\n");
397 machine_shutdown(); 423 machine_shutdown();
398 machine_kexec(image); 424 machine_kexec(image);
@@ -400,21 +426,39 @@ void kernel_kexec(void)
400} 426}
401EXPORT_SYMBOL_GPL(kernel_kexec); 427EXPORT_SYMBOL_GPL(kernel_kexec);
402 428
403void kernel_halt(void) 429/**
430 * kernel_halt_prepare - prepare the system for a halt
431 *
432 * Run the reboot notifiers and shut down devices so kernel_halt() can perform a clean system halt.
433 */
434void kernel_halt_prepare(void)
404{ 435{
405 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); 436 notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
406 system_state = SYSTEM_HALT; 437 system_state = SYSTEM_HALT;
407 device_shutdown(); 438 device_shutdown();
439}
440void kernel_halt(void)
441{
442 kernel_halt_prepare();
408 printk(KERN_EMERG "System halted.\n"); 443 printk(KERN_EMERG "System halted.\n");
409 machine_halt(); 444 machine_halt();
410} 445}
411EXPORT_SYMBOL_GPL(kernel_halt); 446EXPORT_SYMBOL_GPL(kernel_halt);
412 447
413void kernel_power_off(void) 448/**
449 * kernel_power_off - power_off the system
450 *
451 * Shutdown everything and perform a clean system power_off.
452 */
453void kernel_power_off_prepare(void)
414{ 454{
415 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); 455 notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
416 system_state = SYSTEM_POWER_OFF; 456 system_state = SYSTEM_POWER_OFF;
417 device_shutdown(); 457 device_shutdown();
458}
459void kernel_power_off(void)
460{
461 kernel_power_off_prepare();
418 printk(KERN_EMERG "Power down.\n"); 462 printk(KERN_EMERG "Power down.\n");
419 machine_power_off(); 463 machine_power_off();
420} 464}
@@ -583,6 +627,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
583 current->egid = new_egid; 627 current->egid = new_egid;
584 current->gid = new_rgid; 628 current->gid = new_rgid;
585 key_fsgid_changed(current); 629 key_fsgid_changed(current);
630 proc_id_connector(current, PROC_EVENT_GID);
586 return 0; 631 return 0;
587} 632}
588 633
@@ -622,6 +667,7 @@ asmlinkage long sys_setgid(gid_t gid)
622 return -EPERM; 667 return -EPERM;
623 668
624 key_fsgid_changed(current); 669 key_fsgid_changed(current);
670 proc_id_connector(current, PROC_EVENT_GID);
625 return 0; 671 return 0;
626} 672}
627 673
@@ -711,6 +757,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
711 current->fsuid = current->euid; 757 current->fsuid = current->euid;
712 758
713 key_fsuid_changed(current); 759 key_fsuid_changed(current);
760 proc_id_connector(current, PROC_EVENT_UID);
714 761
715 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE); 762 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
716} 763}
@@ -758,6 +805,7 @@ asmlinkage long sys_setuid(uid_t uid)
758 current->suid = new_suid; 805 current->suid = new_suid;
759 806
760 key_fsuid_changed(current); 807 key_fsuid_changed(current);
808 proc_id_connector(current, PROC_EVENT_UID);
761 809
762 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID); 810 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
763} 811}
@@ -806,6 +854,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
806 current->suid = suid; 854 current->suid = suid;
807 855
808 key_fsuid_changed(current); 856 key_fsuid_changed(current);
857 proc_id_connector(current, PROC_EVENT_UID);
809 858
810 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES); 859 return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
811} 860}
@@ -858,6 +907,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
858 current->sgid = sgid; 907 current->sgid = sgid;
859 908
860 key_fsgid_changed(current); 909 key_fsgid_changed(current);
910 proc_id_connector(current, PROC_EVENT_GID);
861 return 0; 911 return 0;
862} 912}
863 913
@@ -900,6 +950,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
900 } 950 }
901 951
902 key_fsuid_changed(current); 952 key_fsuid_changed(current);
953 proc_id_connector(current, PROC_EVENT_UID);
903 954
904 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS); 955 security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
905 956
@@ -928,6 +979,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
928 } 979 }
929 current->fsgid = gid; 980 current->fsgid = gid;
930 key_fsgid_changed(current); 981 key_fsgid_changed(current);
982 proc_id_connector(current, PROC_EVENT_GID);
931 } 983 }
932 return old_fsgid; 984 return old_fsgid;
933} 985}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8e56e2495542..9990e10192e8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -169,7 +169,7 @@ struct file_operations proc_sys_file_operations = {
169 169
170extern struct proc_dir_entry *proc_sys_root; 170extern struct proc_dir_entry *proc_sys_root;
171 171
172static void register_proc_table(ctl_table *, struct proc_dir_entry *); 172static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
173static void unregister_proc_table(ctl_table *, struct proc_dir_entry *); 173static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
174#endif 174#endif
175 175
@@ -952,7 +952,7 @@ static ctl_table fs_table[] = {
952 .data = &aio_nr, 952 .data = &aio_nr,
953 .maxlen = sizeof(aio_nr), 953 .maxlen = sizeof(aio_nr),
954 .mode = 0444, 954 .mode = 0444,
955 .proc_handler = &proc_dointvec, 955 .proc_handler = &proc_doulongvec_minmax,
956 }, 956 },
957 { 957 {
958 .ctl_name = FS_AIO_MAX_NR, 958 .ctl_name = FS_AIO_MAX_NR,
@@ -960,7 +960,7 @@ static ctl_table fs_table[] = {
960 .data = &aio_max_nr, 960 .data = &aio_max_nr,
961 .maxlen = sizeof(aio_max_nr), 961 .maxlen = sizeof(aio_max_nr),
962 .mode = 0644, 962 .mode = 0644,
963 .proc_handler = &proc_dointvec, 963 .proc_handler = &proc_doulongvec_minmax,
964 }, 964 },
965#ifdef CONFIG_INOTIFY 965#ifdef CONFIG_INOTIFY
966 { 966 {
@@ -992,10 +992,51 @@ static ctl_table dev_table[] = {
992 992
993extern void init_irq_proc (void); 993extern void init_irq_proc (void);
994 994
995static DEFINE_SPINLOCK(sysctl_lock);
996
997/* called under sysctl_lock */
998static int use_table(struct ctl_table_header *p)
999{
1000 if (unlikely(p->unregistering))
1001 return 0;
1002 p->used++;
1003 return 1;
1004}
1005
1006/* called under sysctl_lock */
1007static void unuse_table(struct ctl_table_header *p)
1008{
1009 if (!--p->used)
1010 if (unlikely(p->unregistering))
1011 complete(p->unregistering);
1012}
1013
1014/* called under sysctl_lock, will reacquire if has to wait */
1015static void start_unregistering(struct ctl_table_header *p)
1016{
1017 /*
1018 * if p->used is 0, nobody will ever touch that entry again;
1019 * we'll eliminate all paths to it before dropping sysctl_lock
1020 */
1021 if (unlikely(p->used)) {
1022 struct completion wait;
1023 init_completion(&wait);
1024 p->unregistering = &wait;
1025 spin_unlock(&sysctl_lock);
1026 wait_for_completion(&wait);
1027 spin_lock(&sysctl_lock);
1028 }
1029 /*
1030 * do not remove from the list until nobody holds it; walking the
1031 * list in do_sysctl() relies on that.
1032 */
1033 list_del_init(&p->ctl_entry);
1034}
1035
995void __init sysctl_init(void) 1036void __init sysctl_init(void)
996{ 1037{
997#ifdef CONFIG_PROC_FS 1038#ifdef CONFIG_PROC_FS
998 register_proc_table(root_table, proc_sys_root); 1039 register_proc_table(root_table, proc_sys_root, &root_table_header);
999 init_irq_proc(); 1040 init_irq_proc();
1000#endif 1041#endif
1001} 1042}
@@ -1004,6 +1045,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1004 void __user *newval, size_t newlen) 1045 void __user *newval, size_t newlen)
1005{ 1046{
1006 struct list_head *tmp; 1047 struct list_head *tmp;
1048 int error = -ENOTDIR;
1007 1049
1008 if (nlen <= 0 || nlen >= CTL_MAXNAME) 1050 if (nlen <= 0 || nlen >= CTL_MAXNAME)
1009 return -ENOTDIR; 1051 return -ENOTDIR;
@@ -1012,20 +1054,30 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
1012 if (!oldlenp || get_user(old_len, oldlenp)) 1054 if (!oldlenp || get_user(old_len, oldlenp))
1013 return -EFAULT; 1055 return -EFAULT;
1014 } 1056 }
1057 spin_lock(&sysctl_lock);
1015 tmp = &root_table_header.ctl_entry; 1058 tmp = &root_table_header.ctl_entry;
1016 do { 1059 do {
1017 struct ctl_table_header *head = 1060 struct ctl_table_header *head =
1018 list_entry(tmp, struct ctl_table_header, ctl_entry); 1061 list_entry(tmp, struct ctl_table_header, ctl_entry);
1019 void *context = NULL; 1062 void *context = NULL;
1020 int error = parse_table(name, nlen, oldval, oldlenp, 1063
1064 if (!use_table(head))
1065 continue;
1066
1067 spin_unlock(&sysctl_lock);
1068
1069 error = parse_table(name, nlen, oldval, oldlenp,
1021 newval, newlen, head->ctl_table, 1070 newval, newlen, head->ctl_table,
1022 &context); 1071 &context);
1023 kfree(context); 1072 kfree(context);
1073
1074 spin_lock(&sysctl_lock);
1075 unuse_table(head);
1024 if (error != -ENOTDIR) 1076 if (error != -ENOTDIR)
1025 return error; 1077 break;
1026 tmp = tmp->next; 1078 } while ((tmp = tmp->next) != &root_table_header.ctl_entry);
1027 } while (tmp != &root_table_header.ctl_entry); 1079 spin_unlock(&sysctl_lock);
1028 return -ENOTDIR; 1080 return error;
1029} 1081}
1030 1082
1031asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 1083asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
@@ -1236,12 +1288,16 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1236 return NULL; 1288 return NULL;
1237 tmp->ctl_table = table; 1289 tmp->ctl_table = table;
1238 INIT_LIST_HEAD(&tmp->ctl_entry); 1290 INIT_LIST_HEAD(&tmp->ctl_entry);
1291 tmp->used = 0;
1292 tmp->unregistering = NULL;
1293 spin_lock(&sysctl_lock);
1239 if (insert_at_head) 1294 if (insert_at_head)
1240 list_add(&tmp->ctl_entry, &root_table_header.ctl_entry); 1295 list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
1241 else 1296 else
1242 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1297 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1298 spin_unlock(&sysctl_lock);
1243#ifdef CONFIG_PROC_FS 1299#ifdef CONFIG_PROC_FS
1244 register_proc_table(table, proc_sys_root); 1300 register_proc_table(table, proc_sys_root, tmp);
1245#endif 1301#endif
1246 return tmp; 1302 return tmp;
1247} 1303}
@@ -1255,10 +1311,13 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1255 */ 1311 */
1256void unregister_sysctl_table(struct ctl_table_header * header) 1312void unregister_sysctl_table(struct ctl_table_header * header)
1257{ 1313{
1258 list_del(&header->ctl_entry); 1314 might_sleep();
1315 spin_lock(&sysctl_lock);
1316 start_unregistering(header);
1259#ifdef CONFIG_PROC_FS 1317#ifdef CONFIG_PROC_FS
1260 unregister_proc_table(header->ctl_table, proc_sys_root); 1318 unregister_proc_table(header->ctl_table, proc_sys_root);
1261#endif 1319#endif
1320 spin_unlock(&sysctl_lock);
1262 kfree(header); 1321 kfree(header);
1263} 1322}
1264 1323
@@ -1269,7 +1328,7 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1269#ifdef CONFIG_PROC_FS 1328#ifdef CONFIG_PROC_FS
1270 1329
1271/* Scan the sysctl entries in table and add them all into /proc */ 1330/* Scan the sysctl entries in table and add them all into /proc */
1272static void register_proc_table(ctl_table * table, struct proc_dir_entry *root) 1331static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
1273{ 1332{
1274 struct proc_dir_entry *de; 1333 struct proc_dir_entry *de;
1275 int len; 1334 int len;
@@ -1305,13 +1364,14 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
1305 de = create_proc_entry(table->procname, mode, root); 1364 de = create_proc_entry(table->procname, mode, root);
1306 if (!de) 1365 if (!de)
1307 continue; 1366 continue;
1367 de->set = set;
1308 de->data = (void *) table; 1368 de->data = (void *) table;
1309 if (table->proc_handler) 1369 if (table->proc_handler)
1310 de->proc_fops = &proc_sys_file_operations; 1370 de->proc_fops = &proc_sys_file_operations;
1311 } 1371 }
1312 table->de = de; 1372 table->de = de;
1313 if (de->mode & S_IFDIR) 1373 if (de->mode & S_IFDIR)
1314 register_proc_table(table->child, de); 1374 register_proc_table(table->child, de, set);
1315 } 1375 }
1316} 1376}
1317 1377
@@ -1336,6 +1396,13 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root
1336 continue; 1396 continue;
1337 } 1397 }
1338 1398
1399 /*
1400 * In any case, mark the entry as goner; we'll keep it
1401 * around if it's busy, but we'll know to do nothing with
1402 * its fields. We are under sysctl_lock here.
1403 */
1404 de->data = NULL;
1405
1339 /* Don't unregister proc entries that are still being used.. */ 1406 /* Don't unregister proc entries that are still being used.. */
1340 if (atomic_read(&de->count)) 1407 if (atomic_read(&de->count))
1341 continue; 1408 continue;
@@ -1349,27 +1416,38 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
1349 size_t count, loff_t *ppos) 1416 size_t count, loff_t *ppos)
1350{ 1417{
1351 int op; 1418 int op;
1352 struct proc_dir_entry *de; 1419 struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
1353 struct ctl_table *table; 1420 struct ctl_table *table;
1354 size_t res; 1421 size_t res;
1355 ssize_t error; 1422 ssize_t error = -ENOTDIR;
1356
1357 de = PDE(file->f_dentry->d_inode);
1358 if (!de || !de->data)
1359 return -ENOTDIR;
1360 table = (struct ctl_table *) de->data;
1361 if (!table || !table->proc_handler)
1362 return -ENOTDIR;
1363 op = (write ? 002 : 004);
1364 if (ctl_perm(table, op))
1365 return -EPERM;
1366 1423
1367 res = count; 1424 spin_lock(&sysctl_lock);
1368 1425 if (de && de->data && use_table(de->set)) {
1369 error = (*table->proc_handler) (table, write, file, buf, &res, ppos); 1426 /*
1370 if (error) 1427 * at that point we know that sysctl was not unregistered
1371 return error; 1428 * and won't be until we finish
1372 return res; 1429 */
1430 spin_unlock(&sysctl_lock);
1431 table = (struct ctl_table *) de->data;
1432 if (!table || !table->proc_handler)
1433 goto out;
1434 error = -EPERM;
1435 op = (write ? 002 : 004);
1436 if (ctl_perm(table, op))
1437 goto out;
1438
1439 /* careful: calling conventions are nasty here */
1440 res = count;
1441 error = (*table->proc_handler)(table, write, file,
1442 buf, &res, ppos);
1443 if (!error)
1444 error = res;
1445 out:
1446 spin_lock(&sysctl_lock);
1447 unuse_table(de->set);
1448 }
1449 spin_unlock(&sysctl_lock);
1450 return error;
1373} 1451}
1374 1452
1375static int proc_opensys(struct inode *inode, struct file *file) 1453static int proc_opensys(struct inode *inode, struct file *file)
@@ -1997,6 +2075,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
1997 * @filp: the file structure 2075 * @filp: the file structure
1998 * @buffer: the user buffer 2076 * @buffer: the user buffer
1999 * @lenp: the size of the user buffer 2077 * @lenp: the size of the user buffer
2078 * @ppos: pointer to the file position
2000 * 2079 *
2001 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer 2080 * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
2002 * values from/to the user buffer, treated as an ASCII string. 2081 * values from/to the user buffer, treated as an ASCII string.
diff --git a/kernel/time.c b/kernel/time.c
index dd5ae1162a8f..245d595a13cb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc)
338 if (mtemp >= MINSEC) { 338 if (mtemp >= MINSEC) {
339 ltemp = (time_offset / mtemp) << (SHIFT_USEC - 339 ltemp = (time_offset / mtemp) << (SHIFT_USEC -
340 SHIFT_UPDATE); 340 SHIFT_UPDATE);
341 if (ltemp < 0) 341 time_freq += shift_right(ltemp, SHIFT_KH);
342 time_freq -= -ltemp >> SHIFT_KH;
343 else
344 time_freq += ltemp >> SHIFT_KH;
345 } else /* calibration interval too short (p. 12) */ 342 } else /* calibration interval too short (p. 12) */
346 result = TIME_ERROR; 343 result = TIME_ERROR;
347 } else { /* PLL mode */ 344 } else { /* PLL mode */
348 if (mtemp < MAXSEC) { 345 if (mtemp < MAXSEC) {
349 ltemp *= mtemp; 346 ltemp *= mtemp;
350 if (ltemp < 0) 347 time_freq += shift_right(ltemp,(time_constant +
351 time_freq -= -ltemp >> (time_constant +
352 time_constant +
353 SHIFT_KF - SHIFT_USEC);
354 else
355 time_freq += ltemp >> (time_constant +
356 time_constant + 348 time_constant +
357 SHIFT_KF - SHIFT_USEC); 349 SHIFT_KF - SHIFT_USEC));
358 } else /* calibration interval too long (p. 12) */ 350 } else /* calibration interval too long (p. 12) */
359 result = TIME_ERROR; 351 result = TIME_ERROR;
360 } 352 }
361 if (time_freq > time_tolerance) 353 time_freq = min(time_freq, time_tolerance);
362 time_freq = time_tolerance; 354 time_freq = max(time_freq, -time_tolerance);
363 else if (time_freq < -time_tolerance)
364 time_freq = -time_tolerance;
365 } /* STA_PLL || STA_PPSTIME */ 355 } /* STA_PLL || STA_PPSTIME */
366 } /* txc->modes & ADJ_OFFSET */ 356 } /* txc->modes & ADJ_OFFSET */
367 if (txc->modes & ADJ_TICK) { 357 if (txc->modes & ADJ_TICK) {
@@ -384,10 +374,7 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
384 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 374 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
385 txc->offset = save_adjust; 375 txc->offset = save_adjust;
386 else { 376 else {
387 if (time_offset < 0) 377 txc->offset = shift_right(time_offset, SHIFT_UPDATE);
388 txc->offset = -(-time_offset >> SHIFT_UPDATE);
389 else
390 txc->offset = time_offset >> SHIFT_UPDATE;
391 } 378 }
392 txc->freq = time_freq + pps_freq; 379 txc->freq = time_freq + pps_freq;
393 txc->maxerror = time_maxerror; 380 txc->maxerror = time_maxerror;
@@ -532,6 +519,7 @@ int do_settimeofday (struct timespec *tv)
532 clock_was_set(); 519 clock_was_set();
533 return 0; 520 return 0;
534} 521}
522EXPORT_SYMBOL(do_settimeofday);
535 523
536void do_gettimeofday (struct timeval *tv) 524void do_gettimeofday (struct timeval *tv)
537{ 525{
@@ -570,6 +558,7 @@ void getnstimeofday(struct timespec *tv)
570 tv->tv_sec = x.tv_sec; 558 tv->tv_sec = x.tv_sec;
571 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC; 559 tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
572} 560}
561EXPORT_SYMBOL_GPL(getnstimeofday);
573#endif 562#endif
574 563
575#if (BITS_PER_LONG < 64) 564#if (BITS_PER_LONG < 64)
diff --git a/kernel/timer.c b/kernel/timer.c
index 3ba10fa35b60..fd74268d8663 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec);
46#define time_interpolator_update(x) 46#define time_interpolator_update(x)
47#endif 47#endif
48 48
49u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
50
51EXPORT_SYMBOL(jiffies_64);
52
49/* 53/*
50 * per-CPU timer vector definitions: 54 * per-CPU timer vector definitions:
51 */ 55 */
@@ -91,30 +95,6 @@ static inline void set_running_timer(tvec_base_t *base,
91#endif 95#endif
92} 96}
93 97
94static void check_timer_failed(struct timer_list *timer)
95{
96 static int whine_count;
97 if (whine_count < 16) {
98 whine_count++;
99 printk("Uninitialised timer!\n");
100 printk("This is just a warning. Your computer is OK\n");
101 printk("function=0x%p, data=0x%lx\n",
102 timer->function, timer->data);
103 dump_stack();
104 }
105 /*
106 * Now fix it up
107 */
108 timer->magic = TIMER_MAGIC;
109}
110
111static inline void check_timer(struct timer_list *timer)
112{
113 if (timer->magic != TIMER_MAGIC)
114 check_timer_failed(timer);
115}
116
117
118static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) 98static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
119{ 99{
120 unsigned long expires = timer->expires; 100 unsigned long expires = timer->expires;
@@ -177,7 +157,6 @@ void fastcall init_timer(struct timer_list *timer)
177{ 157{
178 timer->entry.next = NULL; 158 timer->entry.next = NULL;
179 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base; 159 timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
180 timer->magic = TIMER_MAGIC;
181} 160}
182EXPORT_SYMBOL(init_timer); 161EXPORT_SYMBOL(init_timer);
183 162
@@ -230,7 +209,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
230 int ret = 0; 209 int ret = 0;
231 210
232 BUG_ON(!timer->function); 211 BUG_ON(!timer->function);
233 check_timer(timer);
234 212
235 base = lock_timer_base(timer, &flags); 213 base = lock_timer_base(timer, &flags);
236 214
@@ -283,9 +261,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
283 unsigned long flags; 261 unsigned long flags;
284 262
285 BUG_ON(timer_pending(timer) || !timer->function); 263 BUG_ON(timer_pending(timer) || !timer->function);
286
287 check_timer(timer);
288
289 spin_lock_irqsave(&base->t_base.lock, flags); 264 spin_lock_irqsave(&base->t_base.lock, flags);
290 timer->base = &base->t_base; 265 timer->base = &base->t_base;
291 internal_add_timer(base, timer); 266 internal_add_timer(base, timer);
@@ -316,8 +291,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
316{ 291{
317 BUG_ON(!timer->function); 292 BUG_ON(!timer->function);
318 293
319 check_timer(timer);
320
321 /* 294 /*
322 * This is a common optimization triggered by the 295 * This is a common optimization triggered by the
323 * networking code - if the timer is re-modified 296 * networking code - if the timer is re-modified
@@ -348,8 +321,6 @@ int del_timer(struct timer_list *timer)
348 unsigned long flags; 321 unsigned long flags;
349 int ret = 0; 322 int ret = 0;
350 323
351 check_timer(timer);
352
353 if (timer_pending(timer)) { 324 if (timer_pending(timer)) {
354 base = lock_timer_base(timer, &flags); 325 base = lock_timer_base(timer, &flags);
355 if (timer_pending(timer)) { 326 if (timer_pending(timer)) {
@@ -412,8 +383,6 @@ out:
412 */ 383 */
413int del_timer_sync(struct timer_list *timer) 384int del_timer_sync(struct timer_list *timer)
414{ 385{
415 check_timer(timer);
416
417 for (;;) { 386 for (;;) {
418 int ret = try_to_del_timer_sync(timer); 387 int ret = try_to_del_timer_sync(timer);
419 if (ret >= 0) 388 if (ret >= 0)
@@ -632,134 +601,118 @@ long time_next_adjust;
632 */ 601 */
633static void second_overflow(void) 602static void second_overflow(void)
634{ 603{
635 long ltemp; 604 long ltemp;
636 605
637 /* Bump the maxerror field */ 606 /* Bump the maxerror field */
638 time_maxerror += time_tolerance >> SHIFT_USEC; 607 time_maxerror += time_tolerance >> SHIFT_USEC;
639 if ( time_maxerror > NTP_PHASE_LIMIT ) { 608 if (time_maxerror > NTP_PHASE_LIMIT) {
640 time_maxerror = NTP_PHASE_LIMIT; 609 time_maxerror = NTP_PHASE_LIMIT;
641 time_status |= STA_UNSYNC; 610 time_status |= STA_UNSYNC;
642 }
643
644 /*
645 * Leap second processing. If in leap-insert state at
646 * the end of the day, the system clock is set back one
647 * second; if in leap-delete state, the system clock is
648 * set ahead one second. The microtime() routine or
649 * external clock driver will insure that reported time
650 * is always monotonic. The ugly divides should be
651 * replaced.
652 */
653 switch (time_state) {
654
655 case TIME_OK:
656 if (time_status & STA_INS)
657 time_state = TIME_INS;
658 else if (time_status & STA_DEL)
659 time_state = TIME_DEL;
660 break;
661
662 case TIME_INS:
663 if (xtime.tv_sec % 86400 == 0) {
664 xtime.tv_sec--;
665 wall_to_monotonic.tv_sec++;
666 /* The timer interpolator will make time change gradually instead
667 * of an immediate jump by one second.
668 */
669 time_interpolator_update(-NSEC_PER_SEC);
670 time_state = TIME_OOP;
671 clock_was_set();
672 printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
673 } 611 }
674 break; 612
675 613 /*
676 case TIME_DEL: 614 * Leap second processing. If in leap-insert state at the end of the
677 if ((xtime.tv_sec + 1) % 86400 == 0) { 615 * day, the system clock is set back one second; if in leap-delete
678 xtime.tv_sec++; 616 * state, the system clock is set ahead one second. The microtime()
679 wall_to_monotonic.tv_sec--; 617 * routine or external clock driver will insure that reported time is
680 /* Use of time interpolator for a gradual change of time */ 618 * always monotonic. The ugly divides should be replaced.
681 time_interpolator_update(NSEC_PER_SEC); 619 */
682 time_state = TIME_WAIT; 620 switch (time_state) {
683 clock_was_set(); 621 case TIME_OK:
684 printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); 622 if (time_status & STA_INS)
623 time_state = TIME_INS;
624 else if (time_status & STA_DEL)
625 time_state = TIME_DEL;
626 break;
627 case TIME_INS:
628 if (xtime.tv_sec % 86400 == 0) {
629 xtime.tv_sec--;
630 wall_to_monotonic.tv_sec++;
631 /*
632 * The timer interpolator will make time change
633 * gradually instead of an immediate jump by one second
634 */
635 time_interpolator_update(-NSEC_PER_SEC);
636 time_state = TIME_OOP;
637 clock_was_set();
638 printk(KERN_NOTICE "Clock: inserting leap second "
639 "23:59:60 UTC\n");
640 }
641 break;
642 case TIME_DEL:
643 if ((xtime.tv_sec + 1) % 86400 == 0) {
644 xtime.tv_sec++;
645 wall_to_monotonic.tv_sec--;
646 /*
647 * Use of time interpolator for a gradual change of
648 * time
649 */
650 time_interpolator_update(NSEC_PER_SEC);
651 time_state = TIME_WAIT;
652 clock_was_set();
653 printk(KERN_NOTICE "Clock: deleting leap second "
654 "23:59:59 UTC\n");
655 }
656 break;
657 case TIME_OOP:
658 time_state = TIME_WAIT;
659 break;
660 case TIME_WAIT:
661 if (!(time_status & (STA_INS | STA_DEL)))
662 time_state = TIME_OK;
685 } 663 }
686 break; 664
687 665 /*
688 case TIME_OOP: 666 * Compute the phase adjustment for the next second. In PLL mode, the
689 time_state = TIME_WAIT; 667 * offset is reduced by a fixed factor times the time constant. In FLL
690 break; 668 * mode the offset is used directly. In either mode, the maximum phase
691 669 * adjustment for each second is clamped so as to spread the adjustment
692 case TIME_WAIT: 670 * over not more than the number of seconds between updates.
693 if (!(time_status & (STA_INS | STA_DEL))) 671 */
694 time_state = TIME_OK;
695 }
696
697 /*
698 * Compute the phase adjustment for the next second. In
699 * PLL mode, the offset is reduced by a fixed factor
700 * times the time constant. In FLL mode the offset is
701 * used directly. In either mode, the maximum phase
702 * adjustment for each second is clamped so as to spread
703 * the adjustment over not more than the number of
704 * seconds between updates.
705 */
706 if (time_offset < 0) {
707 ltemp = -time_offset;
708 if (!(time_status & STA_FLL))
709 ltemp >>= SHIFT_KG + time_constant;
710 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
711 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
712 time_offset += ltemp;
713 time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
714 } else {
715 ltemp = time_offset; 672 ltemp = time_offset;
716 if (!(time_status & STA_FLL)) 673 if (!(time_status & STA_FLL))
717 ltemp >>= SHIFT_KG + time_constant; 674 ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
718 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) 675 ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
719 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; 676 ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
720 time_offset -= ltemp; 677 time_offset -= ltemp;
721 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE); 678 time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
722 } 679
723 680 /*
724 /* 681 * Compute the frequency estimate and additional phase adjustment due
725 * Compute the frequency estimate and additional phase 682 * to frequency error for the next second. When the PPS signal is
726 * adjustment due to frequency error for the next 683 * engaged, gnaw on the watchdog counter and update the frequency
727 * second. When the PPS signal is engaged, gnaw on the 684 * computed by the pll and the PPS signal.
728 * watchdog counter and update the frequency computed by 685 */
729 * the pll and the PPS signal. 686 pps_valid++;
730 */ 687 if (pps_valid == PPS_VALID) { /* PPS signal lost */
731 pps_valid++; 688 pps_jitter = MAXTIME;
732 if (pps_valid == PPS_VALID) { /* PPS signal lost */ 689 pps_stabil = MAXFREQ;
733 pps_jitter = MAXTIME; 690 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
734 pps_stabil = MAXFREQ; 691 STA_PPSWANDER | STA_PPSERROR);
735 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | 692 }
736 STA_PPSWANDER | STA_PPSERROR); 693 ltemp = time_freq + pps_freq;
737 } 694 time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
738 ltemp = time_freq + pps_freq;
739 if (ltemp < 0)
740 time_adj -= -ltemp >>
741 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
742 else
743 time_adj += ltemp >>
744 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
745 695
746#if HZ == 100 696#if HZ == 100
747 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). 697 /*
748 * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14) 698 * Compensate for (HZ==100) != (1 << SHIFT_HZ). Add 25% and 3.125% to
749 */ 699 * get 128.125; => only 0.125% error (p. 14)
750 if (time_adj < 0) 700 */
751 time_adj -= (-time_adj >> 2) + (-time_adj >> 5); 701 time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
752 else 702#endif
753 time_adj += (time_adj >> 2) + (time_adj >> 5); 703#if HZ == 250
704 /*
705 * Compensate for (HZ==250) != (1 << SHIFT_HZ). Add 1.5625% and
706 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
707 */
708 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
754#endif 709#endif
755#if HZ == 1000 710#if HZ == 1000
756 /* Compensate for (HZ==1000) != (1 << SHIFT_HZ). 711 /*
757 * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14) 712 * Compensate for (HZ==1000) != (1 << SHIFT_HZ). Add 1.5625% and
758 */ 713 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
759 if (time_adj < 0) 714 */
760 time_adj -= (-time_adj >> 6) + (-time_adj >> 7); 715 time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
761 else
762 time_adj += (time_adj >> 6) + (time_adj >> 7);
763#endif 716#endif
764} 717}
765 718
@@ -768,23 +721,20 @@ static void update_wall_time_one_tick(void)
768{ 721{
769 long time_adjust_step, delta_nsec; 722 long time_adjust_step, delta_nsec;
770 723
771 if ( (time_adjust_step = time_adjust) != 0 ) { 724 if ((time_adjust_step = time_adjust) != 0 ) {
772 /* We are doing an adjtime thing. 725 /*
773 * 726 * We are doing an adjtime thing. Prepare time_adjust_step to
774 * Prepare time_adjust_step to be within bounds. 727 * be within bounds. Note that a positive time_adjust means we
775 * Note that a positive time_adjust means we want the clock 728 * want the clock to run faster.
776 * to run faster. 729 *
777 * 730 * Limit the amount of the step to be in the range
778 * Limit the amount of the step to be in the range 731 * -tickadj .. +tickadj
779 * -tickadj .. +tickadj 732 */
780 */ 733 time_adjust_step = min(time_adjust_step, (long)tickadj);
781 if (time_adjust > tickadj) 734 time_adjust_step = max(time_adjust_step, (long)-tickadj);
782 time_adjust_step = tickadj; 735
783 else if (time_adjust < -tickadj) 736 /* Reduce by this step the amount of time left */
784 time_adjust_step = -tickadj; 737 time_adjust -= time_adjust_step;
785
786 /* Reduce by this step the amount of time left */
787 time_adjust -= time_adjust_step;
788 } 738 }
789 delta_nsec = tick_nsec + time_adjust_step * 1000; 739 delta_nsec = tick_nsec + time_adjust_step * 1000;
790 /* 740 /*
@@ -792,13 +742,8 @@ static void update_wall_time_one_tick(void)
792 * advance the tick more. 742 * advance the tick more.
793 */ 743 */
794 time_phase += time_adj; 744 time_phase += time_adj;
795 if (time_phase <= -FINENSEC) { 745 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
796 long ltemp = -time_phase >> (SHIFT_SCALE - 10); 746 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
797 time_phase += ltemp << (SHIFT_SCALE - 10);
798 delta_nsec -= ltemp;
799 }
800 else if (time_phase >= FINENSEC) {
801 long ltemp = time_phase >> (SHIFT_SCALE - 10);
802 time_phase -= ltemp << (SHIFT_SCALE - 10); 747 time_phase -= ltemp << (SHIFT_SCALE - 10);
803 delta_nsec += ltemp; 748 delta_nsec += ltemp;
804 } 749 }
@@ -1128,8 +1073,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1128 if (timeout < 0) 1073 if (timeout < 0)
1129 { 1074 {
1130 printk(KERN_ERR "schedule_timeout: wrong timeout " 1075 printk(KERN_ERR "schedule_timeout: wrong timeout "
1131 "value %lx from %p\n", timeout, 1076 "value %lx from %p\n", timeout,
1132 __builtin_return_address(0)); 1077 __builtin_return_address(0));
1133 current->state = TASK_RUNNING; 1078 current->state = TASK_RUNNING;
1134 goto out; 1079 goto out;
1135 } 1080 }
@@ -1137,12 +1082,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1137 1082
1138 expire = timeout + jiffies; 1083 expire = timeout + jiffies;
1139 1084
1140 init_timer(&timer); 1085 setup_timer(&timer, process_timeout, (unsigned long)current);
1141 timer.expires = expire; 1086 __mod_timer(&timer, expire);
1142 timer.data = (unsigned long) current;
1143 timer.function = process_timeout;
1144
1145 add_timer(&timer);
1146 schedule(); 1087 schedule();
1147 del_singleshot_timer_sync(&timer); 1088 del_singleshot_timer_sync(&timer);
1148 1089
@@ -1159,15 +1100,15 @@ EXPORT_SYMBOL(schedule_timeout);
1159 */ 1100 */
1160signed long __sched schedule_timeout_interruptible(signed long timeout) 1101signed long __sched schedule_timeout_interruptible(signed long timeout)
1161{ 1102{
1162 __set_current_state(TASK_INTERRUPTIBLE); 1103 __set_current_state(TASK_INTERRUPTIBLE);
1163 return schedule_timeout(timeout); 1104 return schedule_timeout(timeout);
1164} 1105}
1165EXPORT_SYMBOL(schedule_timeout_interruptible); 1106EXPORT_SYMBOL(schedule_timeout_interruptible);
1166 1107
1167signed long __sched schedule_timeout_uninterruptible(signed long timeout) 1108signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1168{ 1109{
1169 __set_current_state(TASK_UNINTERRUPTIBLE); 1110 __set_current_state(TASK_UNINTERRUPTIBLE);
1170 return schedule_timeout(timeout); 1111 return schedule_timeout(timeout);
1171} 1112}
1172EXPORT_SYMBOL(schedule_timeout_uninterruptible); 1113EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1173 1114
@@ -1507,16 +1448,18 @@ static void time_interpolator_update(long delta_nsec)
1507 if (!time_interpolator) 1448 if (!time_interpolator)
1508 return; 1449 return;
1509 1450
1510 /* The interpolator compensates for late ticks by accumulating 1451 /*
1511 * the late time in time_interpolator->offset. A tick earlier than 1452 * The interpolator compensates for late ticks by accumulating the late
1512 * expected will lead to a reset of the offset and a corresponding 1453 * time in time_interpolator->offset. A tick earlier than expected will
1513 * jump of the clock forward. Again this only works if the 1454 * lead to a reset of the offset and a corresponding jump of the clock
1514 * interpolator clock is running slightly slower than the regular clock 1455 * forward. Again this only works if the interpolator clock is running
1515 * and the tuning logic insures that. 1456 * slightly slower than the regular clock and the tuning logic insures
1516 */ 1457 * that.
1458 */
1517 1459
1518 counter = time_interpolator_get_counter(1); 1460 counter = time_interpolator_get_counter(1);
1519 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1461 offset = time_interpolator->offset +
1462 GET_TI_NSECS(counter, time_interpolator);
1520 1463
1521 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1464 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1522 time_interpolator->offset = offset - delta_nsec; 1465 time_interpolator->offset = offset - delta_nsec;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 91bacb13a7e2..2bd5aee1c736 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -12,6 +12,8 @@
12 * Andrew Morton <andrewm@uow.edu.au> 12 * Andrew Morton <andrewm@uow.edu.au>
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de> 13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu> 14 * Theodore Ts'o <tytso@mit.edu>
15 *
16 * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
15 */ 17 */
16 18
17#include <linux/module.h> 19#include <linux/module.h>
@@ -57,7 +59,7 @@ struct cpu_workqueue_struct {
57 * per-CPU workqueues: 59 * per-CPU workqueues:
58 */ 60 */
59struct workqueue_struct { 61struct workqueue_struct {
60 struct cpu_workqueue_struct cpu_wq[NR_CPUS]; 62 struct cpu_workqueue_struct *cpu_wq;
61 const char *name; 63 const char *name;
62 struct list_head list; /* Empty if single thread */ 64 struct list_head list; /* Empty if single thread */
63}; 65};
@@ -100,9 +102,9 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
100 102
101 if (!test_and_set_bit(0, &work->pending)) { 103 if (!test_and_set_bit(0, &work->pending)) {
102 if (unlikely(is_single_threaded(wq))) 104 if (unlikely(is_single_threaded(wq)))
103 cpu = 0; 105 cpu = any_online_cpu(cpu_online_map);
104 BUG_ON(!list_empty(&work->entry)); 106 BUG_ON(!list_empty(&work->entry));
105 __queue_work(wq->cpu_wq + cpu, work); 107 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
106 ret = 1; 108 ret = 1;
107 } 109 }
108 put_cpu(); 110 put_cpu();
@@ -116,9 +118,9 @@ static void delayed_work_timer_fn(unsigned long __data)
116 int cpu = smp_processor_id(); 118 int cpu = smp_processor_id();
117 119
118 if (unlikely(is_single_threaded(wq))) 120 if (unlikely(is_single_threaded(wq)))
119 cpu = 0; 121 cpu = any_online_cpu(cpu_online_map);
120 122
121 __queue_work(wq->cpu_wq + cpu, work); 123 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
122} 124}
123 125
124int fastcall queue_delayed_work(struct workqueue_struct *wq, 126int fastcall queue_delayed_work(struct workqueue_struct *wq,
@@ -264,14 +266,14 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
264 might_sleep(); 266 might_sleep();
265 267
266 if (is_single_threaded(wq)) { 268 if (is_single_threaded(wq)) {
267 /* Always use cpu 0's area. */ 269 /* Always use first cpu's area. */
268 flush_cpu_workqueue(wq->cpu_wq + 0); 270 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map)));
269 } else { 271 } else {
270 int cpu; 272 int cpu;
271 273
272 lock_cpu_hotplug(); 274 lock_cpu_hotplug();
273 for_each_online_cpu(cpu) 275 for_each_online_cpu(cpu)
274 flush_cpu_workqueue(wq->cpu_wq + cpu); 276 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
275 unlock_cpu_hotplug(); 277 unlock_cpu_hotplug();
276 } 278 }
277} 279}
@@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
279static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 281static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
280 int cpu) 282 int cpu)
281{ 283{
282 struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; 284 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
283 struct task_struct *p; 285 struct task_struct *p;
284 286
285 spin_lock_init(&cwq->lock); 287 spin_lock_init(&cwq->lock);
@@ -312,12 +314,13 @@ struct workqueue_struct *__create_workqueue(const char *name,
312 if (!wq) 314 if (!wq)
313 return NULL; 315 return NULL;
314 316
317 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
315 wq->name = name; 318 wq->name = name;
316 /* We don't need the distraction of CPUs appearing and vanishing. */ 319 /* We don't need the distraction of CPUs appearing and vanishing. */
317 lock_cpu_hotplug(); 320 lock_cpu_hotplug();
318 if (singlethread) { 321 if (singlethread) {
319 INIT_LIST_HEAD(&wq->list); 322 INIT_LIST_HEAD(&wq->list);
320 p = create_workqueue_thread(wq, 0); 323 p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map));
321 if (!p) 324 if (!p)
322 destroy = 1; 325 destroy = 1;
323 else 326 else
@@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
353 unsigned long flags; 356 unsigned long flags;
354 struct task_struct *p; 357 struct task_struct *p;
355 358
356 cwq = wq->cpu_wq + cpu; 359 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
357 spin_lock_irqsave(&cwq->lock, flags); 360 spin_lock_irqsave(&cwq->lock, flags);
358 p = cwq->thread; 361 p = cwq->thread;
359 cwq->thread = NULL; 362 cwq->thread = NULL;
@@ -371,7 +374,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
371 /* We don't need the distraction of CPUs appearing and vanishing. */ 374 /* We don't need the distraction of CPUs appearing and vanishing. */
372 lock_cpu_hotplug(); 375 lock_cpu_hotplug();
373 if (is_single_threaded(wq)) 376 if (is_single_threaded(wq))
374 cleanup_workqueue_thread(wq, 0); 377 cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map));
375 else { 378 else {
376 for_each_online_cpu(cpu) 379 for_each_online_cpu(cpu)
377 cleanup_workqueue_thread(wq, cpu); 380 cleanup_workqueue_thread(wq, cpu);
@@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
380 spin_unlock(&workqueue_lock); 383 spin_unlock(&workqueue_lock);
381 } 384 }
382 unlock_cpu_hotplug(); 385 unlock_cpu_hotplug();
386 free_percpu(wq->cpu_wq);
383 kfree(wq); 387 kfree(wq);
384} 388}
385 389
@@ -458,7 +462,7 @@ int current_is_keventd(void)
458 462
459 BUG_ON(!keventd_wq); 463 BUG_ON(!keventd_wq);
460 464
461 cwq = keventd_wq->cpu_wq + cpu; 465 cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
462 if (current == cwq->thread) 466 if (current == cwq->thread)
463 ret = 1; 467 ret = 1;
464 468
@@ -470,7 +474,7 @@ int current_is_keventd(void)
470/* Take the work from this (downed) CPU. */ 474/* Take the work from this (downed) CPU. */
471static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 475static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
472{ 476{
473 struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; 477 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
474 LIST_HEAD(list); 478 LIST_HEAD(list);
475 struct work_struct *work; 479 struct work_struct *work;
476 480
@@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
481 printk("Taking work for %s\n", wq->name); 485 printk("Taking work for %s\n", wq->name);
482 work = list_entry(list.next,struct work_struct,entry); 486 work = list_entry(list.next,struct work_struct,entry);
483 list_del(&work->entry); 487 list_del(&work->entry);
484 __queue_work(wq->cpu_wq + smp_processor_id(), work); 488 __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
485 } 489 }
486 spin_unlock_irq(&cwq->lock); 490 spin_unlock_irq(&cwq->lock);
487} 491}
@@ -508,16 +512,19 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
508 case CPU_ONLINE: 512 case CPU_ONLINE:
509 /* Kick off worker threads. */ 513 /* Kick off worker threads. */
510 list_for_each_entry(wq, &workqueues, list) { 514 list_for_each_entry(wq, &workqueues, list) {
511 kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu); 515 struct cpu_workqueue_struct *cwq;
512 wake_up_process(wq->cpu_wq[hotcpu].thread); 516
517 cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
518 kthread_bind(cwq->thread, hotcpu);
519 wake_up_process(cwq->thread);
513 } 520 }
514 break; 521 break;
515 522
516 case CPU_UP_CANCELED: 523 case CPU_UP_CANCELED:
517 list_for_each_entry(wq, &workqueues, list) { 524 list_for_each_entry(wq, &workqueues, list) {
518 /* Unbind so it can run. */ 525 /* Unbind so it can run. */
519 kthread_bind(wq->cpu_wq[hotcpu].thread, 526 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
520 smp_processor_id()); 527 any_online_cpu(cpu_online_map));
521 cleanup_workqueue_thread(wq, hotcpu); 528 cleanup_workqueue_thread(wq, hotcpu);
522 } 529 }
523 break; 530 break;