aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPaul Mackerras <paulus@samba.org>2007-05-10 07:08:37 -0400
committerPaul Mackerras <paulus@samba.org>2007-05-10 07:08:37 -0400
commit2ecf042ef530dd0943e41d84b6344f507941af3e (patch)
tree73100361dd74e3f80f14c7c81ba4675948983f44 /kernel
parent32a56ebb24f23da1bbaf24292acf85b6c04526ab (diff)
parentde5603748af8bf7deac403e6ba92887f8d18e812 (diff)
Merge branch 'linux-2.6'
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt4
-rw-r--r--kernel/configs.c15
-rw-r--r--kernel/cpu.c66
-rw-r--r--kernel/cpuset.c7
-rw-r--r--kernel/exit.c18
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/futex.c988
-rw-r--r--kernel/futex_compat.c22
-rw-r--r--kernel/hrtimer.c2
-rw-r--r--kernel/irq/handle.c1
-rw-r--r--kernel/kmod.c6
-rw-r--r--kernel/kthread.c113
-rw-r--r--kernel/module.c10
-rw-r--r--kernel/mutex.c8
-rw-r--r--kernel/power/disk.c195
-rw-r--r--kernel/power/main.c42
-rw-r--r--kernel/power/power.h7
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/user.c13
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/rcupdate.c2
-rw-r--r--kernel/relay.c37
-rw-r--r--kernel/rtmutex.c41
-rw-r--r--kernel/rtmutex_common.h34
-rw-r--r--kernel/sched.c38
-rw-r--r--kernel/signal.c140
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/softlockup.c4
-rw-r--r--kernel/sys.c98
-rw-r--r--kernel/sysctl.c12
-rw-r--r--kernel/time/clocksource.c51
-rw-r--r--kernel/time/timer_list.c25
-rw-r--r--kernel/timer.c14
-rw-r--r--kernel/wait.c2
-rw-r--r--kernel/workqueue.c783
35 files changed, 1711 insertions, 1101 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0b46a5dff4c0..c64ce9c14207 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -23,7 +23,7 @@ config PREEMPT_VOLUNTARY
23 "explicit preemption points" to the kernel code. These new 23 "explicit preemption points" to the kernel code. These new
24 preemption points have been selected to reduce the maximum 24 preemption points have been selected to reduce the maximum
25 latency of rescheduling, providing faster application reactions, 25 latency of rescheduling, providing faster application reactions,
26 at the cost of slighly lower throughput. 26 at the cost of slightly lower throughput.
27 27
28 This allows reaction to interactive events by allowing a 28 This allows reaction to interactive events by allowing a
29 low priority process to voluntarily preempt itself even if it 29 low priority process to voluntarily preempt itself even if it
@@ -43,7 +43,7 @@ config PREEMPT
43 even if it is in kernel mode executing a system call and would 43 even if it is in kernel mode executing a system call and would
44 otherwise not be about to reach a natural preemption point. 44 otherwise not be about to reach a natural preemption point.
45 This allows applications to run more 'smoothly' even when the 45 This allows applications to run more 'smoothly' even when the
46 system is under load, at the cost of slighly lower throughput 46 system is under load, at the cost of slightly lower throughput
47 and a slight runtime overhead to kernel code. 47 and a slight runtime overhead to kernel code.
48 48
49 Select this if you are building a kernel for a desktop or 49 Select this if you are building a kernel for a desktop or
diff --git a/kernel/configs.c b/kernel/configs.c
index 8fa1fb28f8a7..e84d3f9c6c7b 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -61,18 +61,9 @@ static ssize_t
61ikconfig_read_current(struct file *file, char __user *buf, 61ikconfig_read_current(struct file *file, char __user *buf,
62 size_t len, loff_t * offset) 62 size_t len, loff_t * offset)
63{ 63{
64 loff_t pos = *offset; 64 return simple_read_from_buffer(buf, len, offset,
65 ssize_t count; 65 kernel_config_data + MAGIC_SIZE,
66 66 kernel_config_data_size);
67 if (pos >= kernel_config_data_size)
68 return 0;
69
70 count = min(len, (size_t)(kernel_config_data_size - pos));
71 if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
72 return -EFAULT;
73
74 *offset += count;
75 return count;
76} 67}
77 68
78static const struct file_operations ikconfig_file_ops = { 69static const struct file_operations ikconfig_file_ops = {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 36e70845cfc3..208cf3497c10 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu)
97 (!cputime_eq(p->utime, cputime_zero) || 97 (!cputime_eq(p->utime, cputime_zero) ||
98 !cputime_eq(p->stime, cputime_zero))) 98 !cputime_eq(p->stime, cputime_zero)))
99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
100 (state = %ld, flags = %lx) \n", 100 (state = %ld, flags = %x) \n",
101 p->comm, p->pid, cpu, p->state, p->flags); 101 p->comm, p->pid, cpu, p->state, p->flags);
102 } 102 }
103 write_unlock_irq(&tasklist_lock); 103 write_unlock_irq(&tasklist_lock);
@@ -120,11 +120,13 @@ static int take_cpu_down(void *unused)
120} 120}
121 121
122/* Requires cpu_add_remove_lock to be held */ 122/* Requires cpu_add_remove_lock to be held */
123static int _cpu_down(unsigned int cpu) 123static int _cpu_down(unsigned int cpu, int tasks_frozen)
124{ 124{
125 int err; 125 int err, nr_calls = 0;
126 struct task_struct *p; 126 struct task_struct *p;
127 cpumask_t old_allowed, tmp; 127 cpumask_t old_allowed, tmp;
128 void *hcpu = (void *)(long)cpu;
129 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
128 130
129 if (num_online_cpus() == 1) 131 if (num_online_cpus() == 1)
130 return -EBUSY; 132 return -EBUSY;
@@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu)
132 if (!cpu_online(cpu)) 134 if (!cpu_online(cpu))
133 return -EINVAL; 135 return -EINVAL;
134 136
135 err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 137 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
136 (void *)(long)cpu); 138 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
139 hcpu, -1, &nr_calls);
137 if (err == NOTIFY_BAD) { 140 if (err == NOTIFY_BAD) {
141 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
142 hcpu, nr_calls, NULL);
138 printk("%s: attempt to take down CPU %u failed\n", 143 printk("%s: attempt to take down CPU %u failed\n",
139 __FUNCTION__, cpu); 144 __FUNCTION__, cpu);
140 return -EINVAL; 145 err = -EINVAL;
146 goto out_release;
141 } 147 }
142 148
143 /* Ensure that we are not runnable on dying cpu */ 149 /* Ensure that we are not runnable on dying cpu */
@@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu)
152 158
153 if (IS_ERR(p) || cpu_online(cpu)) { 159 if (IS_ERR(p) || cpu_online(cpu)) {
154 /* CPU didn't die: tell everyone. Can't complain. */ 160 /* CPU didn't die: tell everyone. Can't complain. */
155 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 161 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
156 (void *)(long)cpu) == NOTIFY_BAD) 162 hcpu) == NOTIFY_BAD)
157 BUG(); 163 BUG();
158 164
159 if (IS_ERR(p)) { 165 if (IS_ERR(p)) {
@@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu)
170 /* This actually kills the CPU. */ 176 /* This actually kills the CPU. */
171 __cpu_die(cpu); 177 __cpu_die(cpu);
172 178
173 /* Move it here so it can run. */
174 kthread_bind(p, get_cpu());
175 put_cpu();
176
177 /* CPU is completely dead: tell everyone. Too late to complain. */ 179 /* CPU is completely dead: tell everyone. Too late to complain. */
178 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, 180 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
179 (void *)(long)cpu) == NOTIFY_BAD) 181 hcpu) == NOTIFY_BAD)
180 BUG(); 182 BUG();
181 183
182 check_for_tasks(cpu); 184 check_for_tasks(cpu);
@@ -185,6 +187,8 @@ out_thread:
185 err = kthread_stop(p); 187 err = kthread_stop(p);
186out_allowed: 188out_allowed:
187 set_cpus_allowed(current, old_allowed); 189 set_cpus_allowed(current, old_allowed);
190out_release:
191 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
188 return err; 192 return err;
189} 193}
190 194
@@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu)
196 if (cpu_hotplug_disabled) 200 if (cpu_hotplug_disabled)
197 err = -EBUSY; 201 err = -EBUSY;
198 else 202 else
199 err = _cpu_down(cpu); 203 err = _cpu_down(cpu, 0);
200 204
201 mutex_unlock(&cpu_add_remove_lock); 205 mutex_unlock(&cpu_add_remove_lock);
202 return err; 206 return err;
@@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu)
204#endif /*CONFIG_HOTPLUG_CPU*/ 208#endif /*CONFIG_HOTPLUG_CPU*/
205 209
206/* Requires cpu_add_remove_lock to be held */ 210/* Requires cpu_add_remove_lock to be held */
207static int __cpuinit _cpu_up(unsigned int cpu) 211static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
208{ 212{
209 int ret; 213 int ret, nr_calls = 0;
210 void *hcpu = (void *)(long)cpu; 214 void *hcpu = (void *)(long)cpu;
215 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
211 216
212 if (cpu_online(cpu) || !cpu_present(cpu)) 217 if (cpu_online(cpu) || !cpu_present(cpu))
213 return -EINVAL; 218 return -EINVAL;
214 219
215 ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 220 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
221 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
222 -1, &nr_calls);
216 if (ret == NOTIFY_BAD) { 223 if (ret == NOTIFY_BAD) {
217 printk("%s: attempt to bring up CPU %u failed\n", 224 printk("%s: attempt to bring up CPU %u failed\n",
218 __FUNCTION__, cpu); 225 __FUNCTION__, cpu);
@@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu)
229 BUG_ON(!cpu_online(cpu)); 236 BUG_ON(!cpu_online(cpu));
230 237
231 /* Now call notifier in preparation. */ 238 /* Now call notifier in preparation. */
232 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 239 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
233 240
234out_notify: 241out_notify:
235 if (ret != 0) 242 if (ret != 0)
236 raw_notifier_call_chain(&cpu_chain, 243 __raw_notifier_call_chain(&cpu_chain,
237 CPU_UP_CANCELED, hcpu); 244 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
245 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
238 246
239 return ret; 247 return ret;
240} 248}
@@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu)
247 if (cpu_hotplug_disabled) 255 if (cpu_hotplug_disabled)
248 err = -EBUSY; 256 err = -EBUSY;
249 else 257 else
250 err = _cpu_up(cpu); 258 err = _cpu_up(cpu, 0);
251 259
252 mutex_unlock(&cpu_add_remove_lock); 260 mutex_unlock(&cpu_add_remove_lock);
253 return err; 261 return err;
254} 262}
255 263
256#ifdef CONFIG_SUSPEND_SMP 264#ifdef CONFIG_SUSPEND_SMP
257/* Needed to prevent the microcode driver from requesting firmware in its CPU
258 * hotplug notifier during the suspend/resume.
259 */
260int suspend_cpu_hotplug;
261EXPORT_SYMBOL(suspend_cpu_hotplug);
262
263static cpumask_t frozen_cpus; 265static cpumask_t frozen_cpus;
264 266
265int disable_nonboot_cpus(void) 267int disable_nonboot_cpus(void)
@@ -267,7 +269,6 @@ int disable_nonboot_cpus(void)
267 int cpu, first_cpu, error = 0; 269 int cpu, first_cpu, error = 0;
268 270
269 mutex_lock(&cpu_add_remove_lock); 271 mutex_lock(&cpu_add_remove_lock);
270 suspend_cpu_hotplug = 1;
271 first_cpu = first_cpu(cpu_online_map); 272 first_cpu = first_cpu(cpu_online_map);
272 /* We take down all of the non-boot CPUs in one shot to avoid races 273 /* We take down all of the non-boot CPUs in one shot to avoid races
273 * with the userspace trying to use the CPU hotplug at the same time 274 * with the userspace trying to use the CPU hotplug at the same time
@@ -277,7 +278,7 @@ int disable_nonboot_cpus(void)
277 for_each_online_cpu(cpu) { 278 for_each_online_cpu(cpu) {
278 if (cpu == first_cpu) 279 if (cpu == first_cpu)
279 continue; 280 continue;
280 error = _cpu_down(cpu); 281 error = _cpu_down(cpu, 1);
281 if (!error) { 282 if (!error) {
282 cpu_set(cpu, frozen_cpus); 283 cpu_set(cpu, frozen_cpus);
283 printk("CPU%d is down\n", cpu); 284 printk("CPU%d is down\n", cpu);
@@ -294,7 +295,6 @@ int disable_nonboot_cpus(void)
294 } else { 295 } else {
295 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 296 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
296 } 297 }
297 suspend_cpu_hotplug = 0;
298 mutex_unlock(&cpu_add_remove_lock); 298 mutex_unlock(&cpu_add_remove_lock);
299 return error; 299 return error;
300} 300}
@@ -309,10 +309,9 @@ void enable_nonboot_cpus(void)
309 if (cpus_empty(frozen_cpus)) 309 if (cpus_empty(frozen_cpus))
310 goto out; 310 goto out;
311 311
312 suspend_cpu_hotplug = 1;
313 printk("Enabling non-boot CPUs ...\n"); 312 printk("Enabling non-boot CPUs ...\n");
314 for_each_cpu_mask(cpu, frozen_cpus) { 313 for_each_cpu_mask(cpu, frozen_cpus) {
315 error = _cpu_up(cpu); 314 error = _cpu_up(cpu, 1);
316 if (!error) { 315 if (!error) {
317 printk("CPU%d is up\n", cpu); 316 printk("CPU%d is up\n", cpu);
318 continue; 317 continue;
@@ -320,7 +319,6 @@ void enable_nonboot_cpus(void)
320 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 319 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
321 } 320 }
322 cpus_clear(frozen_cpus); 321 cpus_clear(frozen_cpus);
323 suspend_cpu_hotplug = 0;
324out: 322out:
325 mutex_unlock(&cpu_add_remove_lock); 323 mutex_unlock(&cpu_add_remove_lock);
326} 324}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 88b416dfbc72..f57854b08922 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1772,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1772{ 1772{
1773 struct ctr_struct *ctr = file->private_data; 1773 struct ctr_struct *ctr = file->private_data;
1774 1774
1775 if (*ppos + nbytes > ctr->bufsz) 1775 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1776 nbytes = ctr->bufsz - *ppos;
1777 if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
1778 return -EFAULT;
1779 *ppos += nbytes;
1780 return nbytes;
1781} 1776}
1782 1777
1783static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) 1778static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
diff --git a/kernel/exit.c b/kernel/exit.c
index f5a7abb621f3..b0c6f0c3a2df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -26,6 +26,7 @@
26#include <linux/profile.h> 26#include <linux/profile.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
29#include <linux/kthread.h>
29#include <linux/mempolicy.h> 30#include <linux/mempolicy.h>
30#include <linux/taskstats_kern.h> 31#include <linux/taskstats_kern.h>
31#include <linux/delayacct.h> 32#include <linux/delayacct.h>
@@ -254,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp)
254} 255}
255 256
256/** 257/**
257 * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. 258 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
258 * 259 *
259 * If a kernel thread is launched as a result of a system call, or if 260 * If a kernel thread is launched as a result of a system call, or if
260 * it ever exits, it should generally reparent itself to init so that 261 * it ever exits, it should generally reparent itself to kthreadd so it
261 * it is correctly cleaned up on exit. 262 * isn't in the way of other processes and is correctly cleaned up on exit.
262 * 263 *
263 * The various task state such as scheduling policy and priority may have 264 * The various task state such as scheduling policy and priority may have
264 * been inherited from a user process, so we reset them to sane values here. 265 * been inherited from a user process, so we reset them to sane values here.
265 * 266 *
266 * NOTE that reparent_to_init() gives the caller full capabilities. 267 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
267 */ 268 */
268static void reparent_to_init(void) 269static void reparent_to_kthreadd(void)
269{ 270{
270 write_lock_irq(&tasklist_lock); 271 write_lock_irq(&tasklist_lock);
271 272
272 ptrace_unlink(current); 273 ptrace_unlink(current);
273 /* Reparent to init */ 274 /* Reparent to init */
274 remove_parent(current); 275 remove_parent(current);
275 current->parent = child_reaper(current); 276 current->real_parent = current->parent = kthreadd_task;
276 current->real_parent = child_reaper(current);
277 add_parent(current); 277 add_parent(current);
278 278
279 /* Set the exit signal to SIGCHLD so we signal init on exit */ 279 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -347,7 +347,7 @@ int disallow_signal(int sig)
347 return -EINVAL; 347 return -EINVAL;
348 348
349 spin_lock_irq(&current->sighand->siglock); 349 spin_lock_irq(&current->sighand->siglock);
350 sigaddset(&current->blocked, sig); 350 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
351 recalc_sigpending(); 351 recalc_sigpending();
352 spin_unlock_irq(&current->sighand->siglock); 352 spin_unlock_irq(&current->sighand->siglock);
353 return 0; 353 return 0;
@@ -400,7 +400,7 @@ void daemonize(const char *name, ...)
400 current->files = init_task.files; 400 current->files = init_task.files;
401 atomic_inc(&current->files->count); 401 atomic_inc(&current->files->count);
402 402
403 reparent_to_init(); 403 reparent_to_kthreadd();
404} 404}
405 405
406EXPORT_SYMBOL(daemonize); 406EXPORT_SYMBOL(daemonize);
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dd75d4992b..5dd3979747f5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep;
105 105
106void free_task(struct task_struct *tsk) 106void free_task(struct task_struct *tsk)
107{ 107{
108 free_thread_info(tsk->thread_info); 108 free_thread_info(tsk->stack);
109 rt_mutex_debug_task_free(tsk); 109 rt_mutex_debug_task_free(tsk);
110 free_task_struct(tsk); 110 free_task_struct(tsk);
111} 111}
@@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
175 } 175 }
176 176
177 *tsk = *orig; 177 *tsk = *orig;
178 tsk->thread_info = ti; 178 tsk->stack = ti;
179 setup_thread_stack(tsk, orig); 179 setup_thread_stack(tsk, orig);
180 180
181#ifdef CONFIG_CC_STACKPROTECTOR 181#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/futex.c b/kernel/futex.c
index 600bc9d801f2..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 * 18 *
19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 *
19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
20 * enough at me, Linus for the original (flawed) idea, Matthew 23 * enough at me, Linus for the original (flawed) idea, Matthew
21 * Kirkwood for proof-of-concept implementation. 24 * Kirkwood for proof-of-concept implementation.
@@ -53,6 +56,12 @@
53 56
54#include "rtmutex_common.h" 57#include "rtmutex_common.h"
55 58
59#ifdef CONFIG_DEBUG_RT_MUTEXES
60# include "rtmutex-debug.h"
61#else
62# include "rtmutex.h"
63#endif
64
56#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 65#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
57 66
58/* 67/*
@@ -81,12 +90,12 @@ struct futex_pi_state {
81 * we can wake only the relevant ones (hashed queues may be shared). 90 * we can wake only the relevant ones (hashed queues may be shared).
82 * 91 *
83 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
84 * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
85 * The order of wakup is always to make the first condition true, then 94 * The order of wakup is always to make the first condition true, then
86 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiters, then make the second condition true.
87 */ 96 */
88struct futex_q { 97struct futex_q {
89 struct list_head list; 98 struct plist_node list;
90 wait_queue_head_t waiters; 99 wait_queue_head_t waiters;
91 100
92 /* Which hash list lock to use: */ 101 /* Which hash list lock to use: */
@@ -102,14 +111,20 @@ struct futex_q {
102 /* Optional priority inheritance state: */ 111 /* Optional priority inheritance state: */
103 struct futex_pi_state *pi_state; 112 struct futex_pi_state *pi_state;
104 struct task_struct *task; 113 struct task_struct *task;
114
115 /*
116 * This waiter is used in case of requeue from a
117 * normal futex to a PI-futex
118 */
119 struct rt_mutex_waiter waiter;
105}; 120};
106 121
107/* 122/*
108 * Split the global futex_lock into every hash list lock. 123 * Split the global futex_lock into every hash list lock.
109 */ 124 */
110struct futex_hash_bucket { 125struct futex_hash_bucket {
111 spinlock_t lock; 126 spinlock_t lock;
112 struct list_head chain; 127 struct plist_head chain;
113}; 128};
114 129
115static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 130static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -138,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
138 && key1->both.offset == key2->both.offset); 153 && key1->both.offset == key2->both.offset);
139} 154}
140 155
141/* 156/**
142 * Get parameters which are the keys for a futex. 157 * get_futex_key - Get parameters which are the keys for a futex.
158 * @uaddr: virtual address of the futex
159 * @shared: NULL for a PROCESS_PRIVATE futex,
160 * &current->mm->mmap_sem for a PROCESS_SHARED futex
161 * @key: address where result is stored.
162 *
163 * Returns a negative error code or 0
164 * The key words are stored in *key on success.
143 * 165 *
144 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 166 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
145 * offset_within_page). For private mappings, it's (uaddr, current->mm). 167 * offset_within_page). For private mappings, it's (uaddr, current->mm).
146 * We can usually work out the index without swapping in the page. 168 * We can usually work out the index without swapping in the page.
147 * 169 *
148 * Returns: 0, or negative error code. 170 * fshared is NULL for PROCESS_PRIVATE futexes
149 * The key words are stored in *key on success. 171 * For other futexes, it points to &current->mm->mmap_sem and
150 * 172 * caller must have taken the reader lock. but NOT any spinlocks.
151 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
152 */ 173 */
153int get_futex_key(u32 __user *uaddr, union futex_key *key) 174int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
175 union futex_key *key)
154{ 176{
155 unsigned long address = (unsigned long)uaddr; 177 unsigned long address = (unsigned long)uaddr;
156 struct mm_struct *mm = current->mm; 178 struct mm_struct *mm = current->mm;
@@ -162,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
162 * The futex address must be "naturally" aligned. 184 * The futex address must be "naturally" aligned.
163 */ 185 */
164 key->both.offset = address % PAGE_SIZE; 186 key->both.offset = address % PAGE_SIZE;
165 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 187 if (unlikely((address % sizeof(u32)) != 0))
166 return -EINVAL; 188 return -EINVAL;
167 address -= key->both.offset; 189 address -= key->both.offset;
168 190
169 /* 191 /*
192 * PROCESS_PRIVATE futexes are fast.
193 * As the mm cannot disappear under us and the 'key' only needs
194 * virtual address, we dont even have to find the underlying vma.
195 * Note : We do have to check 'uaddr' is a valid user address,
196 * but access_ok() should be faster than find_vma()
197 */
198 if (!fshared) {
199 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
200 return -EFAULT;
201 key->private.mm = mm;
202 key->private.address = address;
203 return 0;
204 }
205 /*
170 * The futex is hashed differently depending on whether 206 * The futex is hashed differently depending on whether
171 * it's in a shared or private mapping. So check vma first. 207 * it's in a shared or private mapping. So check vma first.
172 */ 208 */
@@ -180,6 +216,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
180 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 216 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
181 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 217 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
182 218
219 /* Save the user address in the ley */
220 key->uaddr = uaddr;
221
183 /* 222 /*
184 * Private mappings are handled in a simple way. 223 * Private mappings are handled in a simple way.
185 * 224 *
@@ -190,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
190 * mappings of _writable_ handles. 229 * mappings of _writable_ handles.
191 */ 230 */
192 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 231 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
232 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
193 key->private.mm = mm; 233 key->private.mm = mm;
194 key->private.address = address; 234 key->private.address = address;
195 return 0; 235 return 0;
@@ -199,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
199 * Linear file mappings are also simple. 239 * Linear file mappings are also simple.
200 */ 240 */
201 key->shared.inode = vma->vm_file->f_path.dentry->d_inode; 241 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
202 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 242 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
203 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 243 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
204 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 244 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
205 + vma->vm_pgoff); 245 + vma->vm_pgoff);
@@ -227,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
227 * Take a reference to the resource addressed by a key. 267 * Take a reference to the resource addressed by a key.
228 * Can be called while holding spinlocks. 268 * Can be called while holding spinlocks.
229 * 269 *
230 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
231 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
232 */ 270 */
233inline void get_futex_key_refs(union futex_key *key) 271inline void get_futex_key_refs(union futex_key *key)
234{ 272{
235 if (key->both.ptr != 0) { 273 if (key->both.ptr == 0)
236 if (key->both.offset & 1) 274 return;
275 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
276 case FUT_OFF_INODE:
237 atomic_inc(&key->shared.inode->i_count); 277 atomic_inc(&key->shared.inode->i_count);
238 else 278 break;
279 case FUT_OFF_MMSHARED:
239 atomic_inc(&key->private.mm->mm_count); 280 atomic_inc(&key->private.mm->mm_count);
281 break;
240 } 282 }
241} 283}
242EXPORT_SYMBOL_GPL(get_futex_key_refs); 284EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -247,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
247 */ 289 */
248void drop_futex_key_refs(union futex_key *key) 290void drop_futex_key_refs(union futex_key *key)
249{ 291{
250 if (key->both.ptr != 0) { 292 if (key->both.ptr == 0)
251 if (key->both.offset & 1) 293 return;
294 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
295 case FUT_OFF_INODE:
252 iput(key->shared.inode); 296 iput(key->shared.inode);
253 else 297 break;
298 case FUT_OFF_MMSHARED:
254 mmdrop(key->private.mm); 299 mmdrop(key->private.mm);
300 break;
255 } 301 }
256} 302}
257EXPORT_SYMBOL_GPL(drop_futex_key_refs); 303EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -268,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
268} 314}
269 315
270/* 316/*
271 * Fault handling. Called with current->mm->mmap_sem held. 317 * Fault handling.
318 * if fshared is non NULL, current->mm->mmap_sem is already held
272 */ 319 */
273static int futex_handle_fault(unsigned long address, int attempt) 320static int futex_handle_fault(unsigned long address,
321 struct rw_semaphore *fshared, int attempt)
274{ 322{
275 struct vm_area_struct * vma; 323 struct vm_area_struct * vma;
276 struct mm_struct *mm = current->mm; 324 struct mm_struct *mm = current->mm;
325 int ret = -EFAULT;
277 326
278 if (attempt > 2 || !(vma = find_vma(mm, address)) || 327 if (attempt > 2)
279 vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) 328 return ret;
280 return -EFAULT;
281 329
282 switch (handle_mm_fault(mm, vma, address, 1)) { 330 if (!fshared)
283 case VM_FAULT_MINOR: 331 down_read(&mm->mmap_sem);
284 current->min_flt++; 332 vma = find_vma(mm, address);
285 break; 333 if (vma && address >= vma->vm_start &&
286 case VM_FAULT_MAJOR: 334 (vma->vm_flags & VM_WRITE)) {
287 current->maj_flt++; 335 switch (handle_mm_fault(mm, vma, address, 1)) {
288 break; 336 case VM_FAULT_MINOR:
289 default: 337 ret = 0;
290 return -EFAULT; 338 current->min_flt++;
339 break;
340 case VM_FAULT_MAJOR:
341 ret = 0;
342 current->maj_flt++;
343 break;
344 }
291 } 345 }
292 return 0; 346 if (!fshared)
347 up_read(&mm->mmap_sem);
348 return ret;
293} 349}
294 350
295/* 351/*
@@ -439,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr)
439} 495}
440 496
441static int 497static int
442lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) 498lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
499 union futex_key *key, struct futex_pi_state **ps)
443{ 500{
444 struct futex_pi_state *pi_state = NULL; 501 struct futex_pi_state *pi_state = NULL;
445 struct futex_q *this, *next; 502 struct futex_q *this, *next;
446 struct list_head *head; 503 struct plist_head *head;
447 struct task_struct *p; 504 struct task_struct *p;
448 pid_t pid; 505 pid_t pid;
449 506
450 head = &hb->chain; 507 head = &hb->chain;
451 508
452 list_for_each_entry_safe(this, next, head, list) { 509 plist_for_each_entry_safe(this, next, head, list) {
453 if (match_futex(&this->key, &me->key)) { 510 if (match_futex(&this->key, key)) {
454 /* 511 /*
455 * Another waiter already exists - bump up 512 * Another waiter already exists - bump up
456 * the refcount and return its pi_state: 513 * the refcount and return its pi_state:
@@ -465,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
465 WARN_ON(!atomic_read(&pi_state->refcount)); 522 WARN_ON(!atomic_read(&pi_state->refcount));
466 523
467 atomic_inc(&pi_state->refcount); 524 atomic_inc(&pi_state->refcount);
468 me->pi_state = pi_state; 525 *ps = pi_state;
469 526
470 return 0; 527 return 0;
471 } 528 }
@@ -492,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
492 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 549 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
493 550
494 /* Store the key for possible exit cleanups: */ 551 /* Store the key for possible exit cleanups: */
495 pi_state->key = me->key; 552 pi_state->key = *key;
496 553
497 spin_lock_irq(&p->pi_lock); 554 spin_lock_irq(&p->pi_lock);
498 WARN_ON(!list_empty(&pi_state->list)); 555 WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
502 559
503 put_task_struct(p); 560 put_task_struct(p);
504 561
505 me->pi_state = pi_state; 562 *ps = pi_state;
506 563
507 return 0; 564 return 0;
508} 565}
@@ -513,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
513 */ 570 */
514static void wake_futex(struct futex_q *q) 571static void wake_futex(struct futex_q *q)
515{ 572{
516 list_del_init(&q->list); 573 plist_del(&q->list, &q->list.plist);
517 if (q->filp) 574 if (q->filp)
518 send_sigio(&q->filp->f_owner, q->fd, POLL_IN); 575 send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
519 /* 576 /*
520 * The lock in wake_up_all() is a crucial memory barrier after the 577 * The lock in wake_up_all() is a crucial memory barrier after the
521 * list_del_init() and also before assigning to q->lock_ptr. 578 * plist_del() and also before assigning to q->lock_ptr.
522 */ 579 */
523 wake_up_all(&q->waiters); 580 wake_up_all(&q->waiters);
524 /* 581 /*
@@ -562,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
562 */ 619 */
563 if (!(uval & FUTEX_OWNER_DIED)) { 620 if (!(uval & FUTEX_OWNER_DIED)) {
564 newval = FUTEX_WAITERS | new_owner->pid; 621 newval = FUTEX_WAITERS | new_owner->pid;
622 /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
623 newval |= (uval & FUTEX_WAITER_REQUEUED);
565 624
566 pagefault_disable(); 625 pagefault_disable();
567 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 626 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -629,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
629 * Wake up all waiters hashed on the physical page that is mapped 688 * Wake up all waiters hashed on the physical page that is mapped
630 * to this virtual address: 689 * to this virtual address:
631 */ 690 */
632static int futex_wake(u32 __user *uaddr, int nr_wake) 691static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
692 int nr_wake)
633{ 693{
634 struct futex_hash_bucket *hb; 694 struct futex_hash_bucket *hb;
635 struct futex_q *this, *next; 695 struct futex_q *this, *next;
636 struct list_head *head; 696 struct plist_head *head;
637 union futex_key key; 697 union futex_key key;
638 int ret; 698 int ret;
639 699
640 down_read(&current->mm->mmap_sem); 700 if (fshared)
701 down_read(fshared);
641 702
642 ret = get_futex_key(uaddr, &key); 703 ret = get_futex_key(uaddr, fshared, &key);
643 if (unlikely(ret != 0)) 704 if (unlikely(ret != 0))
644 goto out; 705 goto out;
645 706
@@ -647,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
647 spin_lock(&hb->lock); 708 spin_lock(&hb->lock);
648 head = &hb->chain; 709 head = &hb->chain;
649 710
650 list_for_each_entry_safe(this, next, head, list) { 711 plist_for_each_entry_safe(this, next, head, list) {
651 if (match_futex (&this->key, &key)) { 712 if (match_futex (&this->key, &key)) {
652 if (this->pi_state) { 713 if (this->pi_state) {
653 ret = -EINVAL; 714 ret = -EINVAL;
@@ -661,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
661 722
662 spin_unlock(&hb->lock); 723 spin_unlock(&hb->lock);
663out: 724out:
664 up_read(&current->mm->mmap_sem); 725 if (fshared)
726 up_read(fshared);
727 return ret;
728}
729
730/*
731 * Called from futex_requeue_pi.
732 * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
733 * PI-futex value; search its associated pi_state if an owner exist
734 * or create a new one without owner.
735 */
736static inline int
737lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
738 union futex_key *key,
739 struct futex_pi_state **pi_state)
740{
741 u32 curval, uval, newval;
742
743retry:
744 /*
745 * We can't handle a fault cleanly because we can't
746 * release the locks here. Simply return the fault.
747 */
748 if (get_futex_value_locked(&curval, uaddr))
749 return -EFAULT;
750
751 /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
752 if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
753 != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
754 /*
755 * No waiters yet, we prepare the futex to have some waiters.
756 */
757
758 uval = curval;
759 newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
760
761 pagefault_disable();
762 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
763 pagefault_enable();
764
765 if (unlikely(curval == -EFAULT))
766 return -EFAULT;
767 if (unlikely(curval != uval))
768 goto retry;
769 }
770
771 if (!(curval & FUTEX_TID_MASK)
772 || lookup_pi_state(curval, hb, key, pi_state)) {
773 /* the futex has no owner (yet) or the lookup failed:
774 allocate one pi_state without owner */
775
776 *pi_state = alloc_pi_state();
777
778 /* Already stores the key: */
779 (*pi_state)->key = *key;
780
781 /* init the mutex without owner */
782 __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
783 }
784
785 return 0;
786}
787
788/*
789 * Keep the first nr_wake waiter from futex1, wake up one,
790 * and requeue the next nr_requeue waiters following hashed on
791 * one physical page to another physical page (PI-futex uaddr2)
792 */
793static int futex_requeue_pi(u32 __user *uaddr1,
794 struct rw_semaphore *fshared,
795 u32 __user *uaddr2,
796 int nr_wake, int nr_requeue, u32 *cmpval)
797{
798 union futex_key key1, key2;
799 struct futex_hash_bucket *hb1, *hb2;
800 struct plist_head *head1;
801 struct futex_q *this, *next;
802 struct futex_pi_state *pi_state2 = NULL;
803 struct rt_mutex_waiter *waiter, *top_waiter = NULL;
804 struct rt_mutex *lock2 = NULL;
805 int ret, drop_count = 0;
806
807 if (refill_pi_state_cache())
808 return -ENOMEM;
809
810retry:
811 /*
812 * First take all the futex related locks:
813 */
814 if (fshared)
815 down_read(fshared);
816
817 ret = get_futex_key(uaddr1, fshared, &key1);
818 if (unlikely(ret != 0))
819 goto out;
820 ret = get_futex_key(uaddr2, fshared, &key2);
821 if (unlikely(ret != 0))
822 goto out;
823
824 hb1 = hash_futex(&key1);
825 hb2 = hash_futex(&key2);
826
827 double_lock_hb(hb1, hb2);
828
829 if (likely(cmpval != NULL)) {
830 u32 curval;
831
832 ret = get_futex_value_locked(&curval, uaddr1);
833
834 if (unlikely(ret)) {
835 spin_unlock(&hb1->lock);
836 if (hb1 != hb2)
837 spin_unlock(&hb2->lock);
838
839 /*
840 * If we would have faulted, release mmap_sem, fault
841 * it in and start all over again.
842 */
843 if (fshared)
844 up_read(fshared);
845
846 ret = get_user(curval, uaddr1);
847
848 if (!ret)
849 goto retry;
850
851 return ret;
852 }
853 if (curval != *cmpval) {
854 ret = -EAGAIN;
855 goto out_unlock;
856 }
857 }
858
859 head1 = &hb1->chain;
860 plist_for_each_entry_safe(this, next, head1, list) {
861 if (!match_futex (&this->key, &key1))
862 continue;
863 if (++ret <= nr_wake) {
864 wake_futex(this);
865 } else {
866 /*
867 * FIRST: get and set the pi_state
868 */
869 if (!pi_state2) {
870 int s;
871 /* do this only the first time we requeue someone */
872 s = lookup_pi_state_for_requeue(uaddr2, hb2,
873 &key2, &pi_state2);
874 if (s) {
875 ret = s;
876 goto out_unlock;
877 }
878
879 lock2 = &pi_state2->pi_mutex;
880 spin_lock(&lock2->wait_lock);
881
882 /* Save the top waiter of the wait_list */
883 if (rt_mutex_has_waiters(lock2))
884 top_waiter = rt_mutex_top_waiter(lock2);
885 } else
886 atomic_inc(&pi_state2->refcount);
887
888
889 this->pi_state = pi_state2;
890
891 /*
892 * SECOND: requeue futex_q to the correct hashbucket
893 */
894
895 /*
896 * If key1 and key2 hash to the same bucket, no need to
897 * requeue.
898 */
899 if (likely(head1 != &hb2->chain)) {
900 plist_del(&this->list, &hb1->chain);
901 plist_add(&this->list, &hb2->chain);
902 this->lock_ptr = &hb2->lock;
903#ifdef CONFIG_DEBUG_PI_LIST
904 this->list.plist.lock = &hb2->lock;
905#endif
906 }
907 this->key = key2;
908 get_futex_key_refs(&key2);
909 drop_count++;
910
911
912 /*
913 * THIRD: queue it to lock2
914 */
915 spin_lock_irq(&this->task->pi_lock);
916 waiter = &this->waiter;
917 waiter->task = this->task;
918 waiter->lock = lock2;
919 plist_node_init(&waiter->list_entry, this->task->prio);
920 plist_node_init(&waiter->pi_list_entry, this->task->prio);
921 plist_add(&waiter->list_entry, &lock2->wait_list);
922 this->task->pi_blocked_on = waiter;
923 spin_unlock_irq(&this->task->pi_lock);
924
925 if (ret - nr_wake >= nr_requeue)
926 break;
927 }
928 }
929
930 /* If we've requeued some tasks and the top_waiter of the rt_mutex
931 has changed, we must adjust the priority of the owner, if any */
932 if (drop_count) {
933 struct task_struct *owner = rt_mutex_owner(lock2);
934 if (owner &&
935 (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
936 int chain_walk = 0;
937
938 spin_lock_irq(&owner->pi_lock);
939 if (top_waiter)
940 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
941 else
942 /*
943 * There was no waiters before the requeue,
944 * the flag must be updated
945 */
946 mark_rt_mutex_waiters(lock2);
947
948 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
949 __rt_mutex_adjust_prio(owner);
950 if (owner->pi_blocked_on) {
951 chain_walk = 1;
952 get_task_struct(owner);
953 }
954
955 spin_unlock_irq(&owner->pi_lock);
956 spin_unlock(&lock2->wait_lock);
957
958 if (chain_walk)
959 rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
960 current);
961 } else {
962 /* No owner or the top_waiter does not change */
963 mark_rt_mutex_waiters(lock2);
964 spin_unlock(&lock2->wait_lock);
965 }
966 }
967
968out_unlock:
969 spin_unlock(&hb1->lock);
970 if (hb1 != hb2)
971 spin_unlock(&hb2->lock);
972
973 /* drop_futex_key_refs() must be called outside the spinlocks. */
974 while (--drop_count >= 0)
975 drop_futex_key_refs(&key1);
976
977out:
978 if (fshared)
979 up_read(fshared);
665 return ret; 980 return ret;
666} 981}
667 982
@@ -670,22 +985,24 @@ out:
670 * to this virtual address: 985 * to this virtual address:
671 */ 986 */
672static int 987static int
673futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, 988futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
989 u32 __user *uaddr2,
674 int nr_wake, int nr_wake2, int op) 990 int nr_wake, int nr_wake2, int op)
675{ 991{
676 union futex_key key1, key2; 992 union futex_key key1, key2;
677 struct futex_hash_bucket *hb1, *hb2; 993 struct futex_hash_bucket *hb1, *hb2;
678 struct list_head *head; 994 struct plist_head *head;
679 struct futex_q *this, *next; 995 struct futex_q *this, *next;
680 int ret, op_ret, attempt = 0; 996 int ret, op_ret, attempt = 0;
681 997
682retryfull: 998retryfull:
683 down_read(&current->mm->mmap_sem); 999 if (fshared)
1000 down_read(fshared);
684 1001
685 ret = get_futex_key(uaddr1, &key1); 1002 ret = get_futex_key(uaddr1, fshared, &key1);
686 if (unlikely(ret != 0)) 1003 if (unlikely(ret != 0))
687 goto out; 1004 goto out;
688 ret = get_futex_key(uaddr2, &key2); 1005 ret = get_futex_key(uaddr2, fshared, &key2);
689 if (unlikely(ret != 0)) 1006 if (unlikely(ret != 0))
690 goto out; 1007 goto out;
691 1008
@@ -725,11 +1042,10 @@ retry:
725 * still holding the mmap_sem. 1042 * still holding the mmap_sem.
726 */ 1043 */
727 if (attempt++) { 1044 if (attempt++) {
728 if (futex_handle_fault((unsigned long)uaddr2, 1045 ret = futex_handle_fault((unsigned long)uaddr2,
729 attempt)) { 1046 fshared, attempt);
730 ret = -EFAULT; 1047 if (ret)
731 goto out; 1048 goto out;
732 }
733 goto retry; 1049 goto retry;
734 } 1050 }
735 1051
@@ -737,7 +1053,8 @@ retry:
737 * If we would have faulted, release mmap_sem, 1053 * If we would have faulted, release mmap_sem,
738 * fault it in and start all over again. 1054 * fault it in and start all over again.
739 */ 1055 */
740 up_read(&current->mm->mmap_sem); 1056 if (fshared)
1057 up_read(fshared);
741 1058
742 ret = get_user(dummy, uaddr2); 1059 ret = get_user(dummy, uaddr2);
743 if (ret) 1060 if (ret)
@@ -748,7 +1065,7 @@ retry:
748 1065
749 head = &hb1->chain; 1066 head = &hb1->chain;
750 1067
751 list_for_each_entry_safe(this, next, head, list) { 1068 plist_for_each_entry_safe(this, next, head, list) {
752 if (match_futex (&this->key, &key1)) { 1069 if (match_futex (&this->key, &key1)) {
753 wake_futex(this); 1070 wake_futex(this);
754 if (++ret >= nr_wake) 1071 if (++ret >= nr_wake)
@@ -760,7 +1077,7 @@ retry:
760 head = &hb2->chain; 1077 head = &hb2->chain;
761 1078
762 op_ret = 0; 1079 op_ret = 0;
763 list_for_each_entry_safe(this, next, head, list) { 1080 plist_for_each_entry_safe(this, next, head, list) {
764 if (match_futex (&this->key, &key2)) { 1081 if (match_futex (&this->key, &key2)) {
765 wake_futex(this); 1082 wake_futex(this);
766 if (++op_ret >= nr_wake2) 1083 if (++op_ret >= nr_wake2)
@@ -774,7 +1091,8 @@ retry:
774 if (hb1 != hb2) 1091 if (hb1 != hb2)
775 spin_unlock(&hb2->lock); 1092 spin_unlock(&hb2->lock);
776out: 1093out:
777 up_read(&current->mm->mmap_sem); 1094 if (fshared)
1095 up_read(fshared);
778 return ret; 1096 return ret;
779} 1097}
780 1098
@@ -782,22 +1100,24 @@ out:
782 * Requeue all waiters hashed on one physical page to another 1100 * Requeue all waiters hashed on one physical page to another
783 * physical page. 1101 * physical page.
784 */ 1102 */
785static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, 1103static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
1104 u32 __user *uaddr2,
786 int nr_wake, int nr_requeue, u32 *cmpval) 1105 int nr_wake, int nr_requeue, u32 *cmpval)
787{ 1106{
788 union futex_key key1, key2; 1107 union futex_key key1, key2;
789 struct futex_hash_bucket *hb1, *hb2; 1108 struct futex_hash_bucket *hb1, *hb2;
790 struct list_head *head1; 1109 struct plist_head *head1;
791 struct futex_q *this, *next; 1110 struct futex_q *this, *next;
792 int ret, drop_count = 0; 1111 int ret, drop_count = 0;
793 1112
794 retry: 1113 retry:
795 down_read(&current->mm->mmap_sem); 1114 if (fshared)
1115 down_read(fshared);
796 1116
797 ret = get_futex_key(uaddr1, &key1); 1117 ret = get_futex_key(uaddr1, fshared, &key1);
798 if (unlikely(ret != 0)) 1118 if (unlikely(ret != 0))
799 goto out; 1119 goto out;
800 ret = get_futex_key(uaddr2, &key2); 1120 ret = get_futex_key(uaddr2, fshared, &key2);
801 if (unlikely(ret != 0)) 1121 if (unlikely(ret != 0))
802 goto out; 1122 goto out;
803 1123
@@ -820,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
820 * If we would have faulted, release mmap_sem, fault 1140 * If we would have faulted, release mmap_sem, fault
821 * it in and start all over again. 1141 * it in and start all over again.
822 */ 1142 */
823 up_read(&current->mm->mmap_sem); 1143 if (fshared)
1144 up_read(fshared);
824 1145
825 ret = get_user(curval, uaddr1); 1146 ret = get_user(curval, uaddr1);
826 1147
@@ -836,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
836 } 1157 }
837 1158
838 head1 = &hb1->chain; 1159 head1 = &hb1->chain;
839 list_for_each_entry_safe(this, next, head1, list) { 1160 plist_for_each_entry_safe(this, next, head1, list) {
840 if (!match_futex (&this->key, &key1)) 1161 if (!match_futex (&this->key, &key1))
841 continue; 1162 continue;
842 if (++ret <= nr_wake) { 1163 if (++ret <= nr_wake) {
@@ -847,9 +1168,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
847 * requeue. 1168 * requeue.
848 */ 1169 */
849 if (likely(head1 != &hb2->chain)) { 1170 if (likely(head1 != &hb2->chain)) {
850 list_move_tail(&this->list, &hb2->chain); 1171 plist_del(&this->list, &hb1->chain);
1172 plist_add(&this->list, &hb2->chain);
851 this->lock_ptr = &hb2->lock; 1173 this->lock_ptr = &hb2->lock;
852 } 1174#ifdef CONFIG_DEBUG_PI_LIST
1175 this->list.plist.lock = &hb2->lock;
1176#endif
1177 }
853 this->key = key2; 1178 this->key = key2;
854 get_futex_key_refs(&key2); 1179 get_futex_key_refs(&key2);
855 drop_count++; 1180 drop_count++;
@@ -869,7 +1194,8 @@ out_unlock:
869 drop_futex_key_refs(&key1); 1194 drop_futex_key_refs(&key1);
870 1195
871out: 1196out:
872 up_read(&current->mm->mmap_sem); 1197 if (fshared)
1198 up_read(fshared);
873 return ret; 1199 return ret;
874} 1200}
875 1201
@@ -894,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
894 1220
895static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1221static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
896{ 1222{
897 list_add_tail(&q->list, &hb->chain); 1223 int prio;
1224
1225 /*
1226 * The priority used to register this element is
1227 * - either the real thread-priority for the real-time threads
1228 * (i.e. threads with a priority lower than MAX_RT_PRIO)
1229 * - or MAX_RT_PRIO for non-RT threads.
1230 * Thus, all RT-threads are woken first in priority order, and
1231 * the others are woken last, in FIFO order.
1232 */
1233 prio = min(current->normal_prio, MAX_RT_PRIO);
1234
1235 plist_node_init(&q->list, prio);
1236#ifdef CONFIG_DEBUG_PI_LIST
1237 q->list.plist.lock = &hb->lock;
1238#endif
1239 plist_add(&q->list, &hb->chain);
898 q->task = current; 1240 q->task = current;
899 spin_unlock(&hb->lock); 1241 spin_unlock(&hb->lock);
900} 1242}
@@ -949,8 +1291,8 @@ static int unqueue_me(struct futex_q *q)
949 spin_unlock(lock_ptr); 1291 spin_unlock(lock_ptr);
950 goto retry; 1292 goto retry;
951 } 1293 }
952 WARN_ON(list_empty(&q->list)); 1294 WARN_ON(plist_node_empty(&q->list));
953 list_del(&q->list); 1295 plist_del(&q->list, &q->list.plist);
954 1296
955 BUG_ON(q->pi_state); 1297 BUG_ON(q->pi_state);
956 1298
@@ -964,39 +1306,104 @@ static int unqueue_me(struct futex_q *q)
964 1306
965/* 1307/*
966 * PI futexes can not be requeued and must remove themself from the 1308 * PI futexes can not be requeued and must remove themself from the
967 * hash bucket. The hash bucket lock is held on entry and dropped here. 1309 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1310 * and dropped here.
968 */ 1311 */
969static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) 1312static void unqueue_me_pi(struct futex_q *q)
970{ 1313{
971 WARN_ON(list_empty(&q->list)); 1314 WARN_ON(plist_node_empty(&q->list));
972 list_del(&q->list); 1315 plist_del(&q->list, &q->list.plist);
973 1316
974 BUG_ON(!q->pi_state); 1317 BUG_ON(!q->pi_state);
975 free_pi_state(q->pi_state); 1318 free_pi_state(q->pi_state);
976 q->pi_state = NULL; 1319 q->pi_state = NULL;
977 1320
978 spin_unlock(&hb->lock); 1321 spin_unlock(q->lock_ptr);
979 1322
980 drop_futex_key_refs(&q->key); 1323 drop_futex_key_refs(&q->key);
981} 1324}
982 1325
1326/*
1327 * Fixup the pi_state owner with current.
1328 *
1329 * The cur->mm semaphore must be held, it is released at return of this
1330 * function.
1331 */
1332static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
1333 struct futex_q *q,
1334 struct futex_hash_bucket *hb,
1335 struct task_struct *curr)
1336{
1337 u32 newtid = curr->pid | FUTEX_WAITERS;
1338 struct futex_pi_state *pi_state = q->pi_state;
1339 u32 uval, curval, newval;
1340 int ret;
1341
1342 /* Owner died? */
1343 if (pi_state->owner != NULL) {
1344 spin_lock_irq(&pi_state->owner->pi_lock);
1345 WARN_ON(list_empty(&pi_state->list));
1346 list_del_init(&pi_state->list);
1347 spin_unlock_irq(&pi_state->owner->pi_lock);
1348 } else
1349 newtid |= FUTEX_OWNER_DIED;
1350
1351 pi_state->owner = curr;
1352
1353 spin_lock_irq(&curr->pi_lock);
1354 WARN_ON(!list_empty(&pi_state->list));
1355 list_add(&pi_state->list, &curr->pi_state_list);
1356 spin_unlock_irq(&curr->pi_lock);
1357
1358 /* Unqueue and drop the lock */
1359 unqueue_me_pi(q);
1360 if (fshared)
1361 up_read(fshared);
1362 /*
1363 * We own it, so we have to replace the pending owner
1364 * TID. This must be atomic as we have preserve the
1365 * owner died bit here.
1366 */
1367 ret = get_user(uval, uaddr);
1368 while (!ret) {
1369 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1370 newval |= (uval & FUTEX_WAITER_REQUEUED);
1371 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1372 uval, newval);
1373 if (curval == -EFAULT)
1374 ret = -EFAULT;
1375 if (curval == uval)
1376 break;
1377 uval = curval;
1378 }
1379 return ret;
1380}
1381
1382/*
1383 * In case we must use restart_block to restart a futex_wait,
1384 * we encode in the 'arg3' shared capability
1385 */
1386#define ARG3_SHARED 1
1387
983static long futex_wait_restart(struct restart_block *restart); 1388static long futex_wait_restart(struct restart_block *restart);
984static int futex_wait_abstime(u32 __user *uaddr, u32 val, 1389static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
985 int timed, unsigned long abs_time) 1390 u32 val, ktime_t *abs_time)
986{ 1391{
987 struct task_struct *curr = current; 1392 struct task_struct *curr = current;
988 DECLARE_WAITQUEUE(wait, curr); 1393 DECLARE_WAITQUEUE(wait, curr);
989 struct futex_hash_bucket *hb; 1394 struct futex_hash_bucket *hb;
990 struct futex_q q; 1395 struct futex_q q;
991 unsigned long time_left = 0;
992 u32 uval; 1396 u32 uval;
993 int ret; 1397 int ret;
1398 struct hrtimer_sleeper t, *to = NULL;
1399 int rem = 0;
994 1400
995 q.pi_state = NULL; 1401 q.pi_state = NULL;
996 retry: 1402 retry:
997 down_read(&curr->mm->mmap_sem); 1403 if (fshared)
1404 down_read(fshared);
998 1405
999 ret = get_futex_key(uaddr, &q.key); 1406 ret = get_futex_key(uaddr, fshared, &q.key);
1000 if (unlikely(ret != 0)) 1407 if (unlikely(ret != 0))
1001 goto out_release_sem; 1408 goto out_release_sem;
1002 1409
@@ -1019,8 +1426,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1019 * a wakeup when *uaddr != val on entry to the syscall. This is 1426 * a wakeup when *uaddr != val on entry to the syscall. This is
1020 * rare, but normal. 1427 * rare, but normal.
1021 * 1428 *
1022 * We hold the mmap semaphore, so the mapping cannot have changed 1429 * for shared futexes, we hold the mmap semaphore, so the mapping
1023 * since we looked it up in get_futex_key. 1430 * cannot have changed since we looked it up in get_futex_key.
1024 */ 1431 */
1025 ret = get_futex_value_locked(&uval, uaddr); 1432 ret = get_futex_value_locked(&uval, uaddr);
1026 1433
@@ -1031,7 +1438,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1031 * If we would have faulted, release mmap_sem, fault it in and 1438 * If we would have faulted, release mmap_sem, fault it in and
1032 * start all over again. 1439 * start all over again.
1033 */ 1440 */
1034 up_read(&curr->mm->mmap_sem); 1441 if (fshared)
1442 up_read(fshared);
1035 1443
1036 ret = get_user(uval, uaddr); 1444 ret = get_user(uval, uaddr);
1037 1445
@@ -1043,6 +1451,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1043 if (uval != val) 1451 if (uval != val)
1044 goto out_unlock_release_sem; 1452 goto out_unlock_release_sem;
1045 1453
1454 /*
1455 * This rt_mutex_waiter structure is prepared here and will
1456 * be used only if this task is requeued from a normal futex to
1457 * a PI-futex with futex_requeue_pi.
1458 */
1459 debug_rt_mutex_init_waiter(&q.waiter);
1460 q.waiter.task = NULL;
1461
1046 /* Only actually queue if *uaddr contained val. */ 1462 /* Only actually queue if *uaddr contained val. */
1047 __queue_me(&q, hb); 1463 __queue_me(&q, hb);
1048 1464
@@ -1050,7 +1466,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1050 * Now the futex is queued and we have checked the data, we 1466 * Now the futex is queued and we have checked the data, we
1051 * don't want to hold mmap_sem while we sleep. 1467 * don't want to hold mmap_sem while we sleep.
1052 */ 1468 */
1053 up_read(&curr->mm->mmap_sem); 1469 if (fshared)
1470 up_read(fshared);
1054 1471
1055 /* 1472 /*
1056 * There might have been scheduling since the queue_me(), as we 1473 * There might have been scheduling since the queue_me(), as we
@@ -1065,23 +1482,33 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1065 __set_current_state(TASK_INTERRUPTIBLE); 1482 __set_current_state(TASK_INTERRUPTIBLE);
1066 add_wait_queue(&q.waiters, &wait); 1483 add_wait_queue(&q.waiters, &wait);
1067 /* 1484 /*
1068 * !list_empty() is safe here without any lock. 1485 * !plist_node_empty() is safe here without any lock.
1069 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1486 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1070 */ 1487 */
1071 time_left = 0; 1488 if (likely(!plist_node_empty(&q.list))) {
1072 if (likely(!list_empty(&q.list))) { 1489 if (!abs_time)
1073 unsigned long rel_time; 1490 schedule();
1074 1491 else {
1075 if (timed) { 1492 to = &t;
1076 unsigned long now = jiffies; 1493 hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1077 if (time_after(now, abs_time)) 1494 hrtimer_init_sleeper(&t, current);
1078 rel_time = 0; 1495 t.timer.expires = *abs_time;
1079 else
1080 rel_time = abs_time - now;
1081 } else
1082 rel_time = MAX_SCHEDULE_TIMEOUT;
1083 1496
1084 time_left = schedule_timeout(rel_time); 1497 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
1498
1499 /*
1500 * the timer could have already expired, in which
1501 * case current would be flagged for rescheduling.
1502 * Don't bother calling schedule.
1503 */
1504 if (likely(t.task))
1505 schedule();
1506
1507 hrtimer_cancel(&t.timer);
1508
1509 /* Flag if a timeout occured */
1510 rem = (t.task == NULL);
1511 }
1085 } 1512 }
1086 __set_current_state(TASK_RUNNING); 1513 __set_current_state(TASK_RUNNING);
1087 1514
@@ -1090,17 +1517,80 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1090 * we are the only user of it. 1517 * we are the only user of it.
1091 */ 1518 */
1092 1519
1520 if (q.pi_state) {
1521 /*
1522 * We were woken but have been requeued on a PI-futex.
1523 * We have to complete the lock acquisition by taking
1524 * the rtmutex.
1525 */
1526
1527 struct rt_mutex *lock = &q.pi_state->pi_mutex;
1528
1529 spin_lock(&lock->wait_lock);
1530 if (unlikely(q.waiter.task)) {
1531 remove_waiter(lock, &q.waiter);
1532 }
1533 spin_unlock(&lock->wait_lock);
1534
1535 if (rem)
1536 ret = -ETIMEDOUT;
1537 else
1538 ret = rt_mutex_timed_lock(lock, to, 1);
1539
1540 if (fshared)
1541 down_read(fshared);
1542 spin_lock(q.lock_ptr);
1543
1544 /*
1545 * Got the lock. We might not be the anticipated owner if we
1546 * did a lock-steal - fix up the PI-state in that case.
1547 */
1548 if (!ret && q.pi_state->owner != curr) {
1549 /*
1550 * We MUST play with the futex we were requeued on,
1551 * NOT the current futex.
1552 * We can retrieve it from the key of the pi_state
1553 */
1554 uaddr = q.pi_state->key.uaddr;
1555
1556 /* mmap_sem and hash_bucket lock are unlocked at
1557 return of this function */
1558 ret = fixup_pi_state_owner(uaddr, fshared,
1559 &q, hb, curr);
1560 } else {
1561 /*
1562 * Catch the rare case, where the lock was released
1563 * when we were on the way back before we locked
1564 * the hash bucket.
1565 */
1566 if (ret && q.pi_state->owner == curr) {
1567 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1568 ret = 0;
1569 }
1570 /* Unqueue and drop the lock */
1571 unqueue_me_pi(&q);
1572 if (fshared)
1573 up_read(fshared);
1574 }
1575
1576 debug_rt_mutex_free_waiter(&q.waiter);
1577
1578 return ret;
1579 }
1580
1581 debug_rt_mutex_free_waiter(&q.waiter);
1582
1093 /* If we were woken (and unqueued), we succeeded, whatever. */ 1583 /* If we were woken (and unqueued), we succeeded, whatever. */
1094 if (!unqueue_me(&q)) 1584 if (!unqueue_me(&q))
1095 return 0; 1585 return 0;
1096 if (time_left == 0) 1586 if (rem)
1097 return -ETIMEDOUT; 1587 return -ETIMEDOUT;
1098 1588
1099 /* 1589 /*
1100 * We expect signal_pending(current), but another thread may 1590 * We expect signal_pending(current), but another thread may
1101 * have handled it for us already. 1591 * have handled it for us already.
1102 */ 1592 */
1103 if (time_left == MAX_SCHEDULE_TIMEOUT) 1593 if (!abs_time)
1104 return -ERESTARTSYS; 1594 return -ERESTARTSYS;
1105 else { 1595 else {
1106 struct restart_block *restart; 1596 struct restart_block *restart;
@@ -1108,8 +1598,10 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1108 restart->fn = futex_wait_restart; 1598 restart->fn = futex_wait_restart;
1109 restart->arg0 = (unsigned long)uaddr; 1599 restart->arg0 = (unsigned long)uaddr;
1110 restart->arg1 = (unsigned long)val; 1600 restart->arg1 = (unsigned long)val;
1111 restart->arg2 = (unsigned long)timed; 1601 restart->arg2 = (unsigned long)abs_time;
1112 restart->arg3 = abs_time; 1602 restart->arg3 = 0;
1603 if (fshared)
1604 restart->arg3 |= ARG3_SHARED;
1113 return -ERESTART_RESTARTBLOCK; 1605 return -ERESTART_RESTARTBLOCK;
1114 } 1606 }
1115 1607
@@ -1117,65 +1609,111 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1117 queue_unlock(&q, hb); 1609 queue_unlock(&q, hb);
1118 1610
1119 out_release_sem: 1611 out_release_sem:
1120 up_read(&curr->mm->mmap_sem); 1612 if (fshared)
1613 up_read(fshared);
1121 return ret; 1614 return ret;
1122} 1615}
1123 1616
1124static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
1125{
1126 int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
1127 return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
1128}
1129 1617
1130static long futex_wait_restart(struct restart_block *restart) 1618static long futex_wait_restart(struct restart_block *restart)
1131{ 1619{
1132 u32 __user *uaddr = (u32 __user *)restart->arg0; 1620 u32 __user *uaddr = (u32 __user *)restart->arg0;
1133 u32 val = (u32)restart->arg1; 1621 u32 val = (u32)restart->arg1;
1134 int timed = (int)restart->arg2; 1622 ktime_t *abs_time = (ktime_t *)restart->arg2;
1135 unsigned long abs_time = restart->arg3; 1623 struct rw_semaphore *fshared = NULL;
1136 1624
1137 restart->fn = do_no_restart_syscall; 1625 restart->fn = do_no_restart_syscall;
1138 return (long)futex_wait_abstime(uaddr, val, timed, abs_time); 1626 if (restart->arg3 & ARG3_SHARED)
1627 fshared = &current->mm->mmap_sem;
1628 return (long)futex_wait(uaddr, fshared, val, abs_time);
1139} 1629}
1140 1630
1141 1631
1632static void set_pi_futex_owner(struct futex_hash_bucket *hb,
1633 union futex_key *key, struct task_struct *p)
1634{
1635 struct plist_head *head;
1636 struct futex_q *this, *next;
1637 struct futex_pi_state *pi_state = NULL;
1638 struct rt_mutex *lock;
1639
1640 /* Search a waiter that should already exists */
1641
1642 head = &hb->chain;
1643
1644 plist_for_each_entry_safe(this, next, head, list) {
1645 if (match_futex (&this->key, key)) {
1646 pi_state = this->pi_state;
1647 break;
1648 }
1649 }
1650
1651 BUG_ON(!pi_state);
1652
1653 /* set p as pi_state's owner */
1654 lock = &pi_state->pi_mutex;
1655
1656 spin_lock(&lock->wait_lock);
1657 spin_lock_irq(&p->pi_lock);
1658
1659 list_add(&pi_state->list, &p->pi_state_list);
1660 pi_state->owner = p;
1661
1662
1663 /* set p as pi_mutex's owner */
1664 debug_rt_mutex_proxy_lock(lock, p);
1665 WARN_ON(rt_mutex_owner(lock));
1666 rt_mutex_set_owner(lock, p, 0);
1667 rt_mutex_deadlock_account_lock(lock, p);
1668
1669 plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
1670 &p->pi_waiters);
1671 __rt_mutex_adjust_prio(p);
1672
1673 spin_unlock_irq(&p->pi_lock);
1674 spin_unlock(&lock->wait_lock);
1675}
1676
1142/* 1677/*
1143 * Userspace tried a 0 -> TID atomic transition of the futex value 1678 * Userspace tried a 0 -> TID atomic transition of the futex value
1144 * and failed. The kernel side here does the whole locking operation: 1679 * and failed. The kernel side here does the whole locking operation:
1145 * if there are waiters then it will block, it does PI, etc. (Due to 1680 * if there are waiters then it will block, it does PI, etc. (Due to
1146 * races the kernel might see a 0 value of the futex too.) 1681 * races the kernel might see a 0 value of the futex too.)
1147 */ 1682 */
1148static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, 1683static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1149 long nsec, int trylock) 1684 int detect, ktime_t *time, int trylock)
1150{ 1685{
1151 struct hrtimer_sleeper timeout, *to = NULL; 1686 struct hrtimer_sleeper timeout, *to = NULL;
1152 struct task_struct *curr = current; 1687 struct task_struct *curr = current;
1153 struct futex_hash_bucket *hb; 1688 struct futex_hash_bucket *hb;
1154 u32 uval, newval, curval; 1689 u32 uval, newval, curval;
1155 struct futex_q q; 1690 struct futex_q q;
1156 int ret, attempt = 0; 1691 int ret, lock_held, attempt = 0;
1157 1692
1158 if (refill_pi_state_cache()) 1693 if (refill_pi_state_cache())
1159 return -ENOMEM; 1694 return -ENOMEM;
1160 1695
1161 if (sec != MAX_SCHEDULE_TIMEOUT) { 1696 if (time) {
1162 to = &timeout; 1697 to = &timeout;
1163 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1698 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1164 hrtimer_init_sleeper(to, current); 1699 hrtimer_init_sleeper(to, current);
1165 to->timer.expires = ktime_set(sec, nsec); 1700 to->timer.expires = *time;
1166 } 1701 }
1167 1702
1168 q.pi_state = NULL; 1703 q.pi_state = NULL;
1169 retry: 1704 retry:
1170 down_read(&curr->mm->mmap_sem); 1705 if (fshared)
1706 down_read(fshared);
1171 1707
1172 ret = get_futex_key(uaddr, &q.key); 1708 ret = get_futex_key(uaddr, fshared, &q.key);
1173 if (unlikely(ret != 0)) 1709 if (unlikely(ret != 0))
1174 goto out_release_sem; 1710 goto out_release_sem;
1175 1711
1176 hb = queue_lock(&q, -1, NULL); 1712 hb = queue_lock(&q, -1, NULL);
1177 1713
1178 retry_locked: 1714 retry_locked:
1715 lock_held = 0;
1716
1179 /* 1717 /*
1180 * To avoid races, we attempt to take the lock here again 1718 * To avoid races, we attempt to take the lock here again
1181 * (by doing a 0 -> TID atomic cmpxchg), while holding all 1719 * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1194,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1194 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { 1732 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1195 if (!detect && 0) 1733 if (!detect && 0)
1196 force_sig(SIGKILL, current); 1734 force_sig(SIGKILL, current);
1197 ret = -EDEADLK; 1735 /*
1736 * Normally, this check is done in user space.
1737 * In case of requeue, the owner may attempt to lock this futex,
1738 * even if the ownership has already been given by the previous
1739 * waker.
1740 * In the usual case, this is a case of deadlock, but not in case
1741 * of REQUEUE_PI.
1742 */
1743 if (!(curval & FUTEX_WAITER_REQUEUED))
1744 ret = -EDEADLK;
1198 goto out_unlock_release_sem; 1745 goto out_unlock_release_sem;
1199 } 1746 }
1200 1747
@@ -1206,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1206 goto out_unlock_release_sem; 1753 goto out_unlock_release_sem;
1207 1754
1208 uval = curval; 1755 uval = curval;
1209 newval = uval | FUTEX_WAITERS; 1756 /*
1757 * In case of a requeue, check if there already is an owner
1758 * If not, just take the futex.
1759 */
1760 if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
1761 /* set current as futex owner */
1762 newval = curval | current->pid;
1763 lock_held = 1;
1764 } else
1765 /* Set the WAITERS flag, so the owner will know it has someone
1766 to wake at next unlock */
1767 newval = curval | FUTEX_WAITERS;
1210 1768
1211 pagefault_disable(); 1769 pagefault_disable();
1212 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1770 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1217,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1217 if (unlikely(curval != uval)) 1775 if (unlikely(curval != uval))
1218 goto retry_locked; 1776 goto retry_locked;
1219 1777
1778 if (lock_held) {
1779 set_pi_futex_owner(hb, &q.key, curr);
1780 goto out_unlock_release_sem;
1781 }
1782
1220 /* 1783 /*
1221 * We dont have the lock. Look up the PI state (or create it if 1784 * We dont have the lock. Look up the PI state (or create it if
1222 * we are the first waiter): 1785 * we are the first waiter):
1223 */ 1786 */
1224 ret = lookup_pi_state(uval, hb, &q); 1787 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1225 1788
1226 if (unlikely(ret)) { 1789 if (unlikely(ret)) {
1227 /* 1790 /*
@@ -1263,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1263 * Now the futex is queued and we have checked the data, we 1826 * Now the futex is queued and we have checked the data, we
1264 * don't want to hold mmap_sem while we sleep. 1827 * don't want to hold mmap_sem while we sleep.
1265 */ 1828 */
1266 up_read(&curr->mm->mmap_sem); 1829 if (fshared)
1830 up_read(fshared);
1267 1831
1268 WARN_ON(!q.pi_state); 1832 WARN_ON(!q.pi_state);
1269 /* 1833 /*
@@ -1277,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1277 ret = ret ? 0 : -EWOULDBLOCK; 1841 ret = ret ? 0 : -EWOULDBLOCK;
1278 } 1842 }
1279 1843
1280 down_read(&curr->mm->mmap_sem); 1844 if (fshared)
1845 down_read(fshared);
1281 spin_lock(q.lock_ptr); 1846 spin_lock(q.lock_ptr);
1282 1847
1283 /* 1848 /*
1284 * Got the lock. We might not be the anticipated owner if we 1849 * Got the lock. We might not be the anticipated owner if we
1285 * did a lock-steal - fix up the PI-state in that case. 1850 * did a lock-steal - fix up the PI-state in that case.
1286 */ 1851 */
1287 if (!ret && q.pi_state->owner != curr) { 1852 if (!ret && q.pi_state->owner != curr)
1288 u32 newtid = current->pid | FUTEX_WAITERS; 1853 /* mmap_sem is unlocked at return of this function */
1289 1854 ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
1290 /* Owner died? */ 1855 else {
1291 if (q.pi_state->owner != NULL) {
1292 spin_lock_irq(&q.pi_state->owner->pi_lock);
1293 WARN_ON(list_empty(&q.pi_state->list));
1294 list_del_init(&q.pi_state->list);
1295 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1296 } else
1297 newtid |= FUTEX_OWNER_DIED;
1298
1299 q.pi_state->owner = current;
1300
1301 spin_lock_irq(&current->pi_lock);
1302 WARN_ON(!list_empty(&q.pi_state->list));
1303 list_add(&q.pi_state->list, &current->pi_state_list);
1304 spin_unlock_irq(&current->pi_lock);
1305
1306 /* Unqueue and drop the lock */
1307 unqueue_me_pi(&q, hb);
1308 up_read(&curr->mm->mmap_sem);
1309 /*
1310 * We own it, so we have to replace the pending owner
1311 * TID. This must be atomic as we have preserve the
1312 * owner died bit here.
1313 */
1314 ret = get_user(uval, uaddr);
1315 while (!ret) {
1316 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1317 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1318 uval, newval);
1319 if (curval == -EFAULT)
1320 ret = -EFAULT;
1321 if (curval == uval)
1322 break;
1323 uval = curval;
1324 }
1325 } else {
1326 /* 1856 /*
1327 * Catch the rare case, where the lock was released 1857 * Catch the rare case, where the lock was released
1328 * when we were on the way back before we locked 1858 * when we were on the way back before we locked
@@ -1333,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1333 ret = 0; 1863 ret = 0;
1334 } 1864 }
1335 /* Unqueue and drop the lock */ 1865 /* Unqueue and drop the lock */
1336 unqueue_me_pi(&q, hb); 1866 unqueue_me_pi(&q);
1337 up_read(&curr->mm->mmap_sem); 1867 if (fshared)
1868 up_read(fshared);
1338 } 1869 }
1339 1870
1340 if (!detect && ret == -EDEADLK && 0) 1871 if (!detect && ret == -EDEADLK && 0)
@@ -1346,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1346 queue_unlock(&q, hb); 1877 queue_unlock(&q, hb);
1347 1878
1348 out_release_sem: 1879 out_release_sem:
1349 up_read(&curr->mm->mmap_sem); 1880 if (fshared)
1881 up_read(fshared);
1350 return ret; 1882 return ret;
1351 1883
1352 uaddr_faulted: 1884 uaddr_faulted:
@@ -1357,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1357 * still holding the mmap_sem. 1889 * still holding the mmap_sem.
1358 */ 1890 */
1359 if (attempt++) { 1891 if (attempt++) {
1360 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 1892 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1361 ret = -EFAULT; 1893 attempt);
1894 if (ret)
1362 goto out_unlock_release_sem; 1895 goto out_unlock_release_sem;
1363 }
1364 goto retry_locked; 1896 goto retry_locked;
1365 } 1897 }
1366 1898
1367 queue_unlock(&q, hb); 1899 queue_unlock(&q, hb);
1368 up_read(&curr->mm->mmap_sem); 1900 if (fshared)
1901 up_read(fshared);
1369 1902
1370 ret = get_user(uval, uaddr); 1903 ret = get_user(uval, uaddr);
1371 if (!ret && (uval != -EFAULT)) 1904 if (!ret && (uval != -EFAULT))
@@ -1379,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1379 * This is the in-kernel slowpath: we look up the PI state (if any), 1912 * This is the in-kernel slowpath: we look up the PI state (if any),
1380 * and do the rt-mutex unlock. 1913 * and do the rt-mutex unlock.
1381 */ 1914 */
1382static int futex_unlock_pi(u32 __user *uaddr) 1915static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
1383{ 1916{
1384 struct futex_hash_bucket *hb; 1917 struct futex_hash_bucket *hb;
1385 struct futex_q *this, *next; 1918 struct futex_q *this, *next;
1386 u32 uval; 1919 u32 uval;
1387 struct list_head *head; 1920 struct plist_head *head;
1388 union futex_key key; 1921 union futex_key key;
1389 int ret, attempt = 0; 1922 int ret, attempt = 0;
1390 1923
@@ -1399,9 +1932,10 @@ retry:
1399 /* 1932 /*
1400 * First take all the futex related locks: 1933 * First take all the futex related locks:
1401 */ 1934 */
1402 down_read(&current->mm->mmap_sem); 1935 if (fshared)
1936 down_read(fshared);
1403 1937
1404 ret = get_futex_key(uaddr, &key); 1938 ret = get_futex_key(uaddr, fshared, &key);
1405 if (unlikely(ret != 0)) 1939 if (unlikely(ret != 0))
1406 goto out; 1940 goto out;
1407 1941
@@ -1435,7 +1969,7 @@ retry_locked:
1435 */ 1969 */
1436 head = &hb->chain; 1970 head = &hb->chain;
1437 1971
1438 list_for_each_entry_safe(this, next, head, list) { 1972 plist_for_each_entry_safe(this, next, head, list) {
1439 if (!match_futex (&this->key, &key)) 1973 if (!match_futex (&this->key, &key))
1440 continue; 1974 continue;
1441 ret = wake_futex_pi(uaddr, uval, this); 1975 ret = wake_futex_pi(uaddr, uval, this);
@@ -1460,7 +1994,8 @@ retry_locked:
1460out_unlock: 1994out_unlock:
1461 spin_unlock(&hb->lock); 1995 spin_unlock(&hb->lock);
1462out: 1996out:
1463 up_read(&current->mm->mmap_sem); 1997 if (fshared)
1998 up_read(fshared);
1464 1999
1465 return ret; 2000 return ret;
1466 2001
@@ -1472,15 +2007,16 @@ pi_faulted:
1472 * still holding the mmap_sem. 2007 * still holding the mmap_sem.
1473 */ 2008 */
1474 if (attempt++) { 2009 if (attempt++) {
1475 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 2010 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1476 ret = -EFAULT; 2011 attempt);
2012 if (ret)
1477 goto out_unlock; 2013 goto out_unlock;
1478 }
1479 goto retry_locked; 2014 goto retry_locked;
1480 } 2015 }
1481 2016
1482 spin_unlock(&hb->lock); 2017 spin_unlock(&hb->lock);
1483 up_read(&current->mm->mmap_sem); 2018 if (fshared)
2019 up_read(fshared);
1484 2020
1485 ret = get_user(uval, uaddr); 2021 ret = get_user(uval, uaddr);
1486 if (!ret && (uval != -EFAULT)) 2022 if (!ret && (uval != -EFAULT))
@@ -1509,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp,
1509 poll_wait(filp, &q->waiters, wait); 2045 poll_wait(filp, &q->waiters, wait);
1510 2046
1511 /* 2047 /*
1512 * list_empty() is safe here without any lock. 2048 * plist_node_empty() is safe here without any lock.
1513 * q->lock_ptr != 0 is not safe, because of ordering against wakeup. 2049 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
1514 */ 2050 */
1515 if (list_empty(&q->list)) 2051 if (plist_node_empty(&q->list))
1516 ret = POLLIN | POLLRDNORM; 2052 ret = POLLIN | POLLRDNORM;
1517 2053
1518 return ret; 2054 return ret;
@@ -1532,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1532 struct futex_q *q; 2068 struct futex_q *q;
1533 struct file *filp; 2069 struct file *filp;
1534 int ret, err; 2070 int ret, err;
2071 struct rw_semaphore *fshared;
1535 static unsigned long printk_interval; 2072 static unsigned long printk_interval;
1536 2073
1537 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 2074 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -1573,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
1573 } 2110 }
1574 q->pi_state = NULL; 2111 q->pi_state = NULL;
1575 2112
1576 down_read(&current->mm->mmap_sem); 2113 fshared = &current->mm->mmap_sem;
1577 err = get_futex_key(uaddr, &q->key); 2114 down_read(fshared);
2115 err = get_futex_key(uaddr, fshared, &q->key);
1578 2116
1579 if (unlikely(err != 0)) { 2117 if (unlikely(err != 0)) {
1580 up_read(&current->mm->mmap_sem); 2118 up_read(fshared);
1581 kfree(q); 2119 kfree(q);
1582 goto error; 2120 goto error;
1583 } 2121 }
@@ -1589,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1589 filp->private_data = q; 2127 filp->private_data = q;
1590 2128
1591 queue_me(q, ret, filp); 2129 queue_me(q, ret, filp);
1592 up_read(&current->mm->mmap_sem); 2130 up_read(fshared);
1593 2131
1594 /* Now we map fd to filp, so userspace can access it */ 2132 /* Now we map fd to filp, so userspace can access it */
1595 fd_install(ret, filp); 2133 fd_install(ret, filp);
@@ -1702,6 +2240,8 @@ retry:
1702 * userspace. 2240 * userspace.
1703 */ 2241 */
1704 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2242 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2243 /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
2244 mval |= (uval & FUTEX_WAITER_REQUEUED);
1705 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2245 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1706 2246
1707 if (nval == -EFAULT) 2247 if (nval == -EFAULT)
@@ -1716,7 +2256,7 @@ retry:
1716 */ 2256 */
1717 if (!pi) { 2257 if (!pi) {
1718 if (uval & FUTEX_WAITERS) 2258 if (uval & FUTEX_WAITERS)
1719 futex_wake(uaddr, 1); 2259 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
1720 } 2260 }
1721 } 2261 }
1722 return 0; 2262 return 0;
@@ -1772,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
1772 return; 2312 return;
1773 2313
1774 if (pending) 2314 if (pending)
1775 handle_futex_death((void __user *)pending + futex_offset, curr, pip); 2315 handle_futex_death((void __user *)pending + futex_offset,
2316 curr, pip);
1776 2317
1777 while (entry != &head->list) { 2318 while (entry != &head->list) {
1778 /* 2319 /*
@@ -1798,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr)
1798 } 2339 }
1799} 2340}
1800 2341
1801long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, 2342long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1802 u32 __user *uaddr2, u32 val2, u32 val3) 2343 u32 __user *uaddr2, u32 val2, u32 val3)
1803{ 2344{
1804 int ret; 2345 int ret;
2346 int cmd = op & FUTEX_CMD_MASK;
2347 struct rw_semaphore *fshared = NULL;
2348
2349 if (!(op & FUTEX_PRIVATE_FLAG))
2350 fshared = &current->mm->mmap_sem;
1805 2351
1806 switch (op) { 2352 switch (cmd) {
1807 case FUTEX_WAIT: 2353 case FUTEX_WAIT:
1808 ret = futex_wait(uaddr, val, timeout); 2354 ret = futex_wait(uaddr, fshared, val, timeout);
1809 break; 2355 break;
1810 case FUTEX_WAKE: 2356 case FUTEX_WAKE:
1811 ret = futex_wake(uaddr, val); 2357 ret = futex_wake(uaddr, fshared, val);
1812 break; 2358 break;
1813 case FUTEX_FD: 2359 case FUTEX_FD:
1814 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ 2360 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
1815 ret = futex_fd(uaddr, val); 2361 ret = futex_fd(uaddr, val);
1816 break; 2362 break;
1817 case FUTEX_REQUEUE: 2363 case FUTEX_REQUEUE:
1818 ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); 2364 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
1819 break; 2365 break;
1820 case FUTEX_CMP_REQUEUE: 2366 case FUTEX_CMP_REQUEUE:
1821 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 2367 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
1822 break; 2368 break;
1823 case FUTEX_WAKE_OP: 2369 case FUTEX_WAKE_OP:
1824 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 2370 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
1825 break; 2371 break;
1826 case FUTEX_LOCK_PI: 2372 case FUTEX_LOCK_PI:
1827 ret = futex_lock_pi(uaddr, val, timeout, val2, 0); 2373 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
1828 break; 2374 break;
1829 case FUTEX_UNLOCK_PI: 2375 case FUTEX_UNLOCK_PI:
1830 ret = futex_unlock_pi(uaddr); 2376 ret = futex_unlock_pi(uaddr, fshared);
1831 break; 2377 break;
1832 case FUTEX_TRYLOCK_PI: 2378 case FUTEX_TRYLOCK_PI:
1833 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); 2379 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2380 break;
2381 case FUTEX_CMP_REQUEUE_PI:
2382 ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
1834 break; 2383 break;
1835 default: 2384 default:
1836 ret = -ENOSYS; 2385 ret = -ENOSYS;
@@ -1843,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1843 struct timespec __user *utime, u32 __user *uaddr2, 2392 struct timespec __user *utime, u32 __user *uaddr2,
1844 u32 val3) 2393 u32 val3)
1845{ 2394{
1846 struct timespec t; 2395 struct timespec ts;
1847 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 2396 ktime_t t, *tp = NULL;
1848 u32 val2 = 0; 2397 u32 val2 = 0;
2398 int cmd = op & FUTEX_CMD_MASK;
1849 2399
1850 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 2400 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
1851 if (copy_from_user(&t, utime, sizeof(t)) != 0) 2401 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1852 return -EFAULT; 2402 return -EFAULT;
1853 if (!timespec_valid(&t)) 2403 if (!timespec_valid(&ts))
1854 return -EINVAL; 2404 return -EINVAL;
1855 if (op == FUTEX_WAIT) 2405
1856 timeout = timespec_to_jiffies(&t) + 1; 2406 t = timespec_to_ktime(ts);
1857 else { 2407 if (cmd == FUTEX_WAIT)
1858 timeout = t.tv_sec; 2408 t = ktime_add(ktime_get(), t);
1859 val2 = t.tv_nsec; 2409 tp = &t;
1860 }
1861 } 2410 }
1862 /* 2411 /*
1863 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 2412 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
1864 */ 2413 */
1865 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 2414 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
2415 || cmd == FUTEX_CMP_REQUEUE_PI)
1866 val2 = (u32) (unsigned long) utime; 2416 val2 = (u32) (unsigned long) utime;
1867 2417
1868 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 2418 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
1869} 2419}
1870 2420
1871static int futexfs_get_sb(struct file_system_type *fs_type, 2421static int futexfs_get_sb(struct file_system_type *fs_type,
@@ -1895,7 +2445,7 @@ static int __init init(void)
1895 } 2445 }
1896 2446
1897 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2447 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1898 INIT_LIST_HEAD(&futex_queues[i].chain); 2448 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
1899 spin_lock_init(&futex_queues[i].lock); 2449 spin_lock_init(&futex_queues[i].lock);
1900 } 2450 }
1901 return 0; 2451 return 0;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 50f24eea6cd0..338a9b489fbc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
141 struct compat_timespec __user *utime, u32 __user *uaddr2, 141 struct compat_timespec __user *utime, u32 __user *uaddr2,
142 u32 val3) 142 u32 val3)
143{ 143{
144 struct timespec t; 144 struct timespec ts;
145 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 145 ktime_t t, *tp = NULL;
146 int val2 = 0; 146 int val2 = 0;
147 147
148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
149 if (get_compat_timespec(&t, utime)) 149 if (get_compat_timespec(&ts, utime))
150 return -EFAULT; 150 return -EFAULT;
151 if (!timespec_valid(&t)) 151 if (!timespec_valid(&ts))
152 return -EINVAL; 152 return -EINVAL;
153
154 t = timespec_to_ktime(ts);
153 if (op == FUTEX_WAIT) 155 if (op == FUTEX_WAIT)
154 timeout = timespec_to_jiffies(&t) + 1; 156 t = ktime_add(ktime_get(), t);
155 else { 157 tp = &t;
156 timeout = t.tv_sec;
157 val2 = t.tv_nsec;
158 }
159 } 158 }
160 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 159 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
160 || op == FUTEX_CMP_REQUEUE_PI)
161 val2 = (int) (unsigned long) utime; 161 val2 = (int) (unsigned long) utime;
162 162
163 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 163 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
164} 164}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c9f4f044a8a8..23c03f43e196 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1411 switch (action) { 1411 switch (action) {
1412 1412
1413 case CPU_UP_PREPARE: 1413 case CPU_UP_PREPARE:
1414 case CPU_UP_PREPARE_FROZEN:
1414 init_hrtimers_cpu(cpu); 1415 init_hrtimers_cpu(cpu);
1415 break; 1416 break;
1416 1417
1417#ifdef CONFIG_HOTPLUG_CPU 1418#ifdef CONFIG_HOTPLUG_CPU
1418 case CPU_DEAD: 1419 case CPU_DEAD:
1420 case CPU_DEAD_FROZEN:
1419 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1421 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
1420 migrate_hrtimers(cpu); 1422 migrate_hrtimers(cpu);
1421 break; 1423 break;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 32e1ab1477d1..e391cbb1f566 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
22 * handle_bad_irq - handle spurious and unhandled irqs 22 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 23 * @irq: the interrupt number
24 * @desc: description of the interrupt 24 * @desc: description of the interrupt
25 * @regs: pointer to a register structure
26 * 25 *
27 * Handles spurious and unhandled IRQ's. It also prints a debugmessage. 26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
28 */ 27 */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 49cc4b9c1a8d..4d32eb077179 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -135,7 +135,6 @@ static int ____call_usermodehelper(void *data)
135 135
136 /* Unblock all signals and set the session keyring. */ 136 /* Unblock all signals and set the session keyring. */
137 new_session = key_get(sub_info->ring); 137 new_session = key_get(sub_info->ring);
138 flush_signals(current);
139 spin_lock_irq(&current->sighand->siglock); 138 spin_lock_irq(&current->sighand->siglock);
140 old_session = __install_session_keyring(current, new_session); 139 old_session = __install_session_keyring(current, new_session);
141 flush_signal_handlers(current, 1); 140 flush_signal_handlers(current, 1);
@@ -186,14 +185,9 @@ static int wait_for_helper(void *data)
186{ 185{
187 struct subprocess_info *sub_info = data; 186 struct subprocess_info *sub_info = data;
188 pid_t pid; 187 pid_t pid;
189 struct k_sigaction sa;
190 188
191 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 189 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
192 * populate the status, but will return -ECHILD. */ 190 * populate the status, but will return -ECHILD. */
193 sa.sa.sa_handler = SIG_IGN;
194 sa.sa.sa_flags = 0;
195 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
196 do_sigaction(SIGCHLD, &sa, NULL);
197 allow_signal(SIGCHLD); 191 allow_signal(SIGCHLD);
198 192
199 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 193 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 87c50ccd1d4e..df8a8e8f6ca4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,7 +1,7 @@
1/* Kernel thread helper functions. 1/* Kernel thread helper functions.
2 * Copyright (C) 2004 IBM Corporation, Rusty Russell. 2 * Copyright (C) 2004 IBM Corporation, Rusty Russell.
3 * 3 *
4 * Creation is done via keventd, so that we get a clean environment 4 * Creation is done via kthreadd, so that we get a clean environment
5 * even if we're invoked from userspace (think modprobe, hotplug cpu, 5 * even if we're invoked from userspace (think modprobe, hotplug cpu,
6 * etc.). 6 * etc.).
7 */ 7 */
@@ -15,24 +15,22 @@
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18/* 18static DEFINE_SPINLOCK(kthread_create_lock);
19 * We dont want to execute off keventd since it might 19static LIST_HEAD(kthread_create_list);
20 * hold a semaphore our callers hold too: 20struct task_struct *kthreadd_task;
21 */
22static struct workqueue_struct *helper_wq;
23 21
24struct kthread_create_info 22struct kthread_create_info
25{ 23{
26 /* Information passed to kthread() from keventd. */ 24 /* Information passed to kthread() from kthreadd. */
27 int (*threadfn)(void *data); 25 int (*threadfn)(void *data);
28 void *data; 26 void *data;
29 struct completion started; 27 struct completion started;
30 28
31 /* Result passed back to kthread_create() from keventd. */ 29 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 30 struct task_struct *result;
33 struct completion done; 31 struct completion done;
34 32
35 struct work_struct work; 33 struct list_head list;
36}; 34};
37 35
38struct kthread_stop_info 36struct kthread_stop_info
@@ -60,42 +58,17 @@ int kthread_should_stop(void)
60} 58}
61EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
62 60
63static void kthread_exit_files(void)
64{
65 struct fs_struct *fs;
66 struct task_struct *tsk = current;
67
68 exit_fs(tsk); /* current->fs->count--; */
69 fs = init_task.fs;
70 tsk->fs = fs;
71 atomic_inc(&fs->count);
72 exit_files(tsk);
73 current->files = init_task.files;
74 atomic_inc(&tsk->files->count);
75}
76
77static int kthread(void *_create) 61static int kthread(void *_create)
78{ 62{
79 struct kthread_create_info *create = _create; 63 struct kthread_create_info *create = _create;
80 int (*threadfn)(void *data); 64 int (*threadfn)(void *data);
81 void *data; 65 void *data;
82 sigset_t blocked;
83 int ret = -EINTR; 66 int ret = -EINTR;
84 67
85 kthread_exit_files(); 68 /* Copy data: it's on kthread's stack */
86
87 /* Copy data: it's on keventd's stack */
88 threadfn = create->threadfn; 69 threadfn = create->threadfn;
89 data = create->data; 70 data = create->data;
90 71
91 /* Block and flush all signals (in case we're not from keventd). */
92 sigfillset(&blocked);
93 sigprocmask(SIG_BLOCK, &blocked, NULL);
94 flush_signals(current);
95
96 /* By default we can run anywhere, unlike keventd. */
97 set_cpus_allowed(current, CPU_MASK_ALL);
98
99 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
100 __set_current_state(TASK_INTERRUPTIBLE); 73 __set_current_state(TASK_INTERRUPTIBLE);
101 complete(&create->started); 74 complete(&create->started);
@@ -112,11 +85,8 @@ static int kthread(void *_create)
112 return 0; 85 return 0;
113} 86}
114 87
115/* We are keventd: create a thread. */ 88static void create_kthread(struct kthread_create_info *create)
116static void keventd_create_kthread(struct work_struct *work)
117{ 89{
118 struct kthread_create_info *create =
119 container_of(work, struct kthread_create_info, work);
120 int pid; 90 int pid;
121 91
122 /* We want our own signal handler (we take no signals by default). */ 92 /* We want our own signal handler (we take no signals by default). */
@@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
162 create.data = data; 132 create.data = data;
163 init_completion(&create.started); 133 init_completion(&create.started);
164 init_completion(&create.done); 134 init_completion(&create.done);
165 INIT_WORK(&create.work, keventd_create_kthread); 135
166 136 spin_lock(&kthread_create_lock);
167 /* 137 list_add_tail(&create.list, &kthread_create_list);
168 * The workqueue needs to start up first: 138 wake_up_process(kthreadd_task);
169 */ 139 spin_unlock(&kthread_create_lock);
170 if (!helper_wq) 140
171 create.work.func(&create.work); 141 wait_for_completion(&create.done);
172 else { 142
173 queue_work(helper_wq, &create.work);
174 wait_for_completion(&create.done);
175 }
176 if (!IS_ERR(create.result)) { 143 if (!IS_ERR(create.result)) {
177 va_list args; 144 va_list args;
178 va_start(args, namefmt); 145 va_start(args, namefmt);
@@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
180 namefmt, args); 147 namefmt, args);
181 va_end(args); 148 va_end(args);
182 } 149 }
183
184 return create.result; 150 return create.result;
185} 151}
186EXPORT_SYMBOL(kthread_create); 152EXPORT_SYMBOL(kthread_create);
@@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k)
245} 211}
246EXPORT_SYMBOL(kthread_stop); 212EXPORT_SYMBOL(kthread_stop);
247 213
248static __init int helper_init(void) 214
215static __init void kthreadd_setup(void)
249{ 216{
250 helper_wq = create_singlethread_workqueue("kthread"); 217 struct task_struct *tsk = current;
251 BUG_ON(!helper_wq);
252 218
253 return 0; 219 set_task_comm(tsk, "kthreadd");
220
221 ignore_signals(tsk);
222
223 set_user_nice(tsk, -5);
224 set_cpus_allowed(tsk, CPU_MASK_ALL);
254} 225}
255 226
256core_initcall(helper_init); 227int kthreadd(void *unused)
228{
229 /* Setup a clean context for our children to inherit. */
230 kthreadd_setup();
231
232 current->flags |= PF_NOFREEZE;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236 if (list_empty(&kthread_create_list))
237 schedule();
238 __set_current_state(TASK_RUNNING);
239
240 spin_lock(&kthread_create_lock);
241 while (!list_empty(&kthread_create_list)) {
242 struct kthread_create_info *create;
243
244 create = list_entry(kthread_create_list.next,
245 struct kthread_create_info, list);
246 list_del_init(&create->list);
247 spin_unlock(&kthread_create_lock);
248
249 create_kthread(create);
250
251 spin_lock(&kthread_create_lock);
252 }
253 spin_unlock(&kthread_create_lock);
254 }
255
256 return 0;
257}
diff --git a/kernel/module.c b/kernel/module.c
index d36e45477fac..9bd93de01f4a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -96,9 +96,9 @@ static inline void add_taint_module(struct module *mod, unsigned flag)
96 mod->taints |= flag; 96 mod->taints |= flag;
97} 97}
98 98
99/* A thread that wants to hold a reference to a module only while it 99/*
100 * is running can call ths to safely exit. 100 * A thread that wants to hold a reference to a module only while it
101 * nfsd and lockd use this. 101 * is running can call this to safely exit. nfsd and lockd use this.
102 */ 102 */
103void __module_put_and_exit(struct module *mod, long code) 103void __module_put_and_exit(struct module *mod, long code)
104{ 104{
@@ -1199,7 +1199,7 @@ static int __unlink_module(void *_mod)
1199 return 0; 1199 return 0;
1200} 1200}
1201 1201
1202/* Free a module, remove from lists, etc (must hold module mutex). */ 1202/* Free a module, remove from lists, etc (must hold module_mutex). */
1203static void free_module(struct module *mod) 1203static void free_module(struct module *mod)
1204{ 1204{
1205 /* Delete from various lists */ 1205 /* Delete from various lists */
@@ -1246,7 +1246,7 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1246 1246
1247/* 1247/*
1248 * Ensure that an exported symbol [global namespace] does not already exist 1248 * Ensure that an exported symbol [global namespace] does not already exist
1249 * in the Kernel or in some other modules exported symbol table. 1249 * in the kernel or in some other module's exported symbol table.
1250 */ 1250 */
1251static int verify_export_symbols(struct module *mod) 1251static int verify_export_symbols(struct module *mod)
1252{ 1252{
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e7cbbb82765b..303eab18484b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
133 133
134 debug_mutex_lock_common(lock, &waiter); 134 debug_mutex_lock_common(lock, &waiter);
135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
136 debug_mutex_add_waiter(lock, &waiter, task->thread_info); 136 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
137 137
138 /* add waiting tasks to the end of the waitqueue (FIFO): */ 138 /* add waiting tasks to the end of the waitqueue (FIFO): */
139 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
@@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
159 */ 159 */
160 if (unlikely(state == TASK_INTERRUPTIBLE && 160 if (unlikely(state == TASK_INTERRUPTIBLE &&
161 signal_pending(task))) { 161 signal_pending(task))) {
162 mutex_remove_waiter(lock, &waiter, task->thread_info); 162 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
163 mutex_release(&lock->dep_map, 1, _RET_IP_); 163 mutex_release(&lock->dep_map, 1, _RET_IP_);
164 spin_unlock_mutex(&lock->wait_lock, flags); 164 spin_unlock_mutex(&lock->wait_lock, flags);
165 165
@@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
175 } 175 }
176 176
177 /* got the lock - rejoice! */ 177 /* got the lock - rejoice! */
178 mutex_remove_waiter(lock, &waiter, task->thread_info); 178 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
179 debug_mutex_set_owner(lock, task->thread_info); 179 debug_mutex_set_owner(lock, task_thread_info(task));
180 180
181 /* set it to 0 if there are no waiters left: */ 181 /* set it to 0 if there are no waiters left: */
182 if (likely(list_empty(&lock->wait_list))) 182 if (likely(list_empty(&lock->wait_list)))
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 06331374d862..b5f0543ed84d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;
30dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block; 31sector_t swsusp_resume_block;
32 32
33enum {
34 HIBERNATION_INVALID,
35 HIBERNATION_PLATFORM,
36 HIBERNATION_TEST,
37 HIBERNATION_TESTPROC,
38 HIBERNATION_SHUTDOWN,
39 HIBERNATION_REBOOT,
40 /* keep last */
41 __HIBERNATION_AFTER_LAST
42};
43#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
44#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
45
46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47
48struct hibernation_ops *hibernation_ops;
49
50/**
51 * hibernation_set_ops - set the global hibernate operations
52 * @ops: the hibernation operations to use in subsequent hibernation transitions
53 */
54
55void hibernation_set_ops(struct hibernation_ops *ops)
56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish)) {
58 WARN_ON(1);
59 return;
60 }
61 mutex_lock(&pm_mutex);
62 hibernation_ops = ops;
63 if (ops)
64 hibernation_mode = HIBERNATION_PLATFORM;
65 else if (hibernation_mode == HIBERNATION_PLATFORM)
66 hibernation_mode = HIBERNATION_SHUTDOWN;
67
68 mutex_unlock(&pm_mutex);
69}
70
71
33/** 72/**
34 * platform_prepare - prepare the machine for hibernation using the 73 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails 74 * platform driver if so configured and return an error code if it fails
36 */ 75 */
37 76
38static inline int platform_prepare(void) 77static int platform_prepare(void)
39{ 78{
40 int error = 0; 79 return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ?
80 hibernation_ops->prepare() : 0;
81}
41 82
42 switch (pm_disk_mode) { 83/**
43 case PM_DISK_TEST: 84 * platform_finish - switch the machine to the normal mode of operation
44 case PM_DISK_TESTPROC: 85 * using the platform driver (must be called after platform_prepare())
45 case PM_DISK_SHUTDOWN: 86 */
46 case PM_DISK_REBOOT: 87
47 break; 88static void platform_finish(void)
48 default: 89{
49 if (pm_ops && pm_ops->prepare) 90 if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops)
50 error = pm_ops->prepare(PM_SUSPEND_DISK); 91 hibernation_ops->finish();
51 }
52 return error;
53} 92}
54 93
55/** 94/**
56 * power_down - Shut machine down for hibernate. 95 * power_down - Shut the machine down for hibernation.
57 * 96 *
58 * Use the platform driver, if configured so; otherwise try 97 * Use the platform driver, if configured so; otherwise try
59 * to power off or reboot. 98 * to power off or reboot.
@@ -61,20 +100,20 @@ static inline int platform_prepare(void)
61 100
62static void power_down(void) 101static void power_down(void)
63{ 102{
64 switch (pm_disk_mode) { 103 switch (hibernation_mode) {
65 case PM_DISK_TEST: 104 case HIBERNATION_TEST:
66 case PM_DISK_TESTPROC: 105 case HIBERNATION_TESTPROC:
67 break; 106 break;
68 case PM_DISK_SHUTDOWN: 107 case HIBERNATION_SHUTDOWN:
69 kernel_power_off(); 108 kernel_power_off();
70 break; 109 break;
71 case PM_DISK_REBOOT: 110 case HIBERNATION_REBOOT:
72 kernel_restart(NULL); 111 kernel_restart(NULL);
73 break; 112 break;
74 default: 113 case HIBERNATION_PLATFORM:
75 if (pm_ops && pm_ops->enter) { 114 if (hibernation_ops) {
76 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 115 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
77 pm_ops->enter(PM_SUSPEND_DISK); 116 hibernation_ops->enter();
78 break; 117 break;
79 } 118 }
80 } 119 }
@@ -87,20 +126,6 @@ static void power_down(void)
87 while(1); 126 while(1);
88} 127}
89 128
90static inline void platform_finish(void)
91{
92 switch (pm_disk_mode) {
93 case PM_DISK_TEST:
94 case PM_DISK_TESTPROC:
95 case PM_DISK_SHUTDOWN:
96 case PM_DISK_REBOOT:
97 break;
98 default:
99 if (pm_ops && pm_ops->finish)
100 pm_ops->finish(PM_SUSPEND_DISK);
101 }
102}
103
104static void unprepare_processes(void) 129static void unprepare_processes(void)
105{ 130{
106 thaw_processes(); 131 thaw_processes();
@@ -120,13 +145,10 @@ static int prepare_processes(void)
120} 145}
121 146
122/** 147/**
123 * pm_suspend_disk - The granpappy of hibernation power management. 148 * hibernate - The granpappy of the built-in hibernation management
124 *
125 * If not, then call swsusp to do its thing, then figure out how
126 * to power down the system.
127 */ 149 */
128 150
129int pm_suspend_disk(void) 151int hibernate(void)
130{ 152{
131 int error; 153 int error;
132 154
@@ -143,7 +165,8 @@ int pm_suspend_disk(void)
143 if (error) 165 if (error)
144 goto Finish; 166 goto Finish;
145 167
146 if (pm_disk_mode == PM_DISK_TESTPROC) { 168 mutex_lock(&pm_mutex);
169 if (hibernation_mode == HIBERNATION_TESTPROC) {
147 printk("swsusp debug: Waiting for 5 seconds.\n"); 170 printk("swsusp debug: Waiting for 5 seconds.\n");
148 mdelay(5000); 171 mdelay(5000);
149 goto Thaw; 172 goto Thaw;
@@ -168,7 +191,7 @@ int pm_suspend_disk(void)
168 if (error) 191 if (error)
169 goto Enable_cpus; 192 goto Enable_cpus;
170 193
171 if (pm_disk_mode == PM_DISK_TEST) { 194 if (hibernation_mode == HIBERNATION_TEST) {
172 printk("swsusp debug: Waiting for 5 seconds.\n"); 195 printk("swsusp debug: Waiting for 5 seconds.\n");
173 mdelay(5000); 196 mdelay(5000);
174 goto Enable_cpus; 197 goto Enable_cpus;
@@ -205,6 +228,7 @@ int pm_suspend_disk(void)
205 device_resume(); 228 device_resume();
206 resume_console(); 229 resume_console();
207 Thaw: 230 Thaw:
231 mutex_unlock(&pm_mutex);
208 unprepare_processes(); 232 unprepare_processes();
209 Finish: 233 Finish:
210 free_basic_memory_bitmaps(); 234 free_basic_memory_bitmaps();
@@ -220,7 +244,7 @@ int pm_suspend_disk(void)
220 * Called as a late_initcall (so all devices are discovered and 244 * Called as a late_initcall (so all devices are discovered and
221 * initialized), we call swsusp to see if we have a saved image or not. 245 * initialized), we call swsusp to see if we have a saved image or not.
222 * If so, we quiesce devices, the restore the saved image. We will 246 * If so, we quiesce devices, the restore the saved image. We will
223 * return above (in pm_suspend_disk() ) if everything goes well. 247 * return above (in hibernate() ) if everything goes well.
224 * Otherwise, we fail gracefully and return to the normally 248 * Otherwise, we fail gracefully and return to the normally
225 * scheduled program. 249 * scheduled program.
226 * 250 *
@@ -315,25 +339,26 @@ static int software_resume(void)
315late_initcall(software_resume); 339late_initcall(software_resume);
316 340
317 341
318static const char * const pm_disk_modes[] = { 342static const char * const hibernation_modes[] = {
319 [PM_DISK_PLATFORM] = "platform", 343 [HIBERNATION_PLATFORM] = "platform",
320 [PM_DISK_SHUTDOWN] = "shutdown", 344 [HIBERNATION_SHUTDOWN] = "shutdown",
321 [PM_DISK_REBOOT] = "reboot", 345 [HIBERNATION_REBOOT] = "reboot",
322 [PM_DISK_TEST] = "test", 346 [HIBERNATION_TEST] = "test",
323 [PM_DISK_TESTPROC] = "testproc", 347 [HIBERNATION_TESTPROC] = "testproc",
324}; 348};
325 349
326/** 350/**
327 * disk - Control suspend-to-disk mode 351 * disk - Control hibernation mode
328 * 352 *
329 * Suspend-to-disk can be handled in several ways. We have a few options 353 * Suspend-to-disk can be handled in several ways. We have a few options
330 * for putting the system to sleep - using the platform driver (e.g. ACPI 354 * for putting the system to sleep - using the platform driver (e.g. ACPI
331 * or other pm_ops), powering off the system or rebooting the system 355 * or other hibernation_ops), powering off the system or rebooting the
332 * (for testing) as well as the two test modes. 356 * system (for testing) as well as the two test modes.
333 * 357 *
334 * The system can support 'platform', and that is known a priori (and 358 * The system can support 'platform', and that is known a priori (and
335 * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot' 359 * encoded by the presence of hibernation_ops). However, the user may
336 * as alternatives, as well as the test modes 'test' and 'testproc'. 360 * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
361 * test modes, 'test' or 'testproc'.
337 * 362 *
338 * show() will display what the mode is currently set to. 363 * show() will display what the mode is currently set to.
339 * store() will accept one of 364 * store() will accept one of
@@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = {
345 * 'testproc' 370 * 'testproc'
346 * 371 *
347 * It will only change to 'platform' if the system 372 * It will only change to 'platform' if the system
348 * supports it (as determined from pm_ops->pm_disk_mode). 373 * supports it (as determined by having hibernation_ops).
349 */ 374 */
350 375
351static ssize_t disk_show(struct kset *kset, char *buf) 376static ssize_t disk_show(struct kset *kset, char *buf)
@@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf)
353 int i; 378 int i;
354 char *start = buf; 379 char *start = buf;
355 380
356 for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { 381 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
357 if (!pm_disk_modes[i]) 382 if (!hibernation_modes[i])
358 continue; 383 continue;
359 switch (i) { 384 switch (i) {
360 case PM_DISK_SHUTDOWN: 385 case HIBERNATION_SHUTDOWN:
361 case PM_DISK_REBOOT: 386 case HIBERNATION_REBOOT:
362 case PM_DISK_TEST: 387 case HIBERNATION_TEST:
363 case PM_DISK_TESTPROC: 388 case HIBERNATION_TESTPROC:
364 break; 389 break;
365 default: 390 case HIBERNATION_PLATFORM:
366 if (pm_ops && pm_ops->enter && 391 if (hibernation_ops)
367 (i == pm_ops->pm_disk_mode))
368 break; 392 break;
369 /* not a valid mode, continue with loop */ 393 /* not a valid mode, continue with loop */
370 continue; 394 continue;
371 } 395 }
372 if (i == pm_disk_mode) 396 if (i == hibernation_mode)
373 buf += sprintf(buf, "[%s]", pm_disk_modes[i]); 397 buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
374 else 398 else
375 buf += sprintf(buf, "%s", pm_disk_modes[i]); 399 buf += sprintf(buf, "%s ", hibernation_modes[i]);
376 if (i+1 != PM_DISK_MAX)
377 buf += sprintf(buf, " ");
378 } 400 }
379 buf += sprintf(buf, "\n"); 401 buf += sprintf(buf, "\n");
380 return buf-start; 402 return buf-start;
@@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
387 int i; 409 int i;
388 int len; 410 int len;
389 char *p; 411 char *p;
390 suspend_disk_method_t mode = 0; 412 int mode = HIBERNATION_INVALID;
391 413
392 p = memchr(buf, '\n', n); 414 p = memchr(buf, '\n', n);
393 len = p ? p - buf : n; 415 len = p ? p - buf : n;
394 416
395 mutex_lock(&pm_mutex); 417 mutex_lock(&pm_mutex);
396 for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { 418 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
397 if (!strncmp(buf, pm_disk_modes[i], len)) { 419 if (!strncmp(buf, hibernation_modes[i], len)) {
398 mode = i; 420 mode = i;
399 break; 421 break;
400 } 422 }
401 } 423 }
402 if (mode) { 424 if (mode != HIBERNATION_INVALID) {
403 switch (mode) { 425 switch (mode) {
404 case PM_DISK_SHUTDOWN: 426 case HIBERNATION_SHUTDOWN:
405 case PM_DISK_REBOOT: 427 case HIBERNATION_REBOOT:
406 case PM_DISK_TEST: 428 case HIBERNATION_TEST:
407 case PM_DISK_TESTPROC: 429 case HIBERNATION_TESTPROC:
408 pm_disk_mode = mode; 430 hibernation_mode = mode;
409 break; 431 break;
410 default: 432 case HIBERNATION_PLATFORM:
411 if (pm_ops && pm_ops->enter && 433 if (hibernation_ops)
412 (mode == pm_ops->pm_disk_mode)) 434 hibernation_mode = mode;
413 pm_disk_mode = mode;
414 else 435 else
415 error = -EINVAL; 436 error = -EINVAL;
416 } 437 }
417 } else { 438 } else
418 error = -EINVAL; 439 error = -EINVAL;
419 }
420 440
421 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 441 if (!error)
422 pm_disk_modes[mode]); 442 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
443 hibernation_modes[mode]);
423 mutex_unlock(&pm_mutex); 444 mutex_unlock(&pm_mutex);
424 return error ? error : n; 445 return error ? error : n;
425} 446}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6dda685e7e2..40d56a31245e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,6 @@
30DEFINE_MUTEX(pm_mutex); 30DEFINE_MUTEX(pm_mutex);
31 31
32struct pm_ops *pm_ops; 32struct pm_ops *pm_ops;
33suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
34 33
35/** 34/**
36 * pm_set_ops - Set the global power method table. 35 * pm_set_ops - Set the global power method table.
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops)
41{ 40{
42 mutex_lock(&pm_mutex); 41 mutex_lock(&pm_mutex);
43 pm_ops = ops; 42 pm_ops = ops;
44 if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
45 pm_disk_mode = ops->pm_disk_mode;
46 } else
47 pm_disk_mode = PM_DISK_SHUTDOWN;
48 mutex_unlock(&pm_mutex); 43 mutex_unlock(&pm_mutex);
49} 44}
50 45
@@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state)
184static const char * const pm_states[PM_SUSPEND_MAX] = { 179static const char * const pm_states[PM_SUSPEND_MAX] = {
185 [PM_SUSPEND_STANDBY] = "standby", 180 [PM_SUSPEND_STANDBY] = "standby",
186 [PM_SUSPEND_MEM] = "mem", 181 [PM_SUSPEND_MEM] = "mem",
187 [PM_SUSPEND_DISK] = "disk",
188}; 182};
189 183
190static inline int valid_state(suspend_state_t state) 184static inline int valid_state(suspend_state_t state)
191{ 185{
192 /* Suspend-to-disk does not really need low-level support. 186 /* All states need lowlevel support and need to be valid
193 * It can work with shutdown/reboot if needed. If it isn't 187 * to the lowlevel implementation, no valid callback
194 * configured, then it cannot be supported.
195 */
196 if (state == PM_SUSPEND_DISK)
197#ifdef CONFIG_SOFTWARE_SUSPEND
198 return 1;
199#else
200 return 0;
201#endif
202
203 /* all other states need lowlevel support and need to be
204 * valid to the lowlevel implementation, no valid callback
205 * implies that none are valid. */ 188 * implies that none are valid. */
206 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) 189 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
207 return 0; 190 return 0;
@@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state)
229 if (!mutex_trylock(&pm_mutex)) 212 if (!mutex_trylock(&pm_mutex))
230 return -EBUSY; 213 return -EBUSY;
231 214
232 if (state == PM_SUSPEND_DISK) {
233 error = pm_suspend_disk();
234 goto Unlock;
235 }
236
237 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 215 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
238 if ((error = suspend_prepare(state))) 216 if ((error = suspend_prepare(state)))
239 goto Unlock; 217 goto Unlock;
@@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state)
251 229
252/** 230/**
253 * pm_suspend - Externally visible function for suspending system. 231 * pm_suspend - Externally visible function for suspending system.
254 * @state: Enumarted value of state to enter. 232 * @state: Enumerated value of state to enter.
255 * 233 *
256 * Determine whether or not value is within range, get state 234 * Determine whether or not value is within range, get state
257 * structure, and enter (above). 235 * structure, and enter (above).
@@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf)
289 if (pm_states[i] && valid_state(i)) 267 if (pm_states[i] && valid_state(i))
290 s += sprintf(s,"%s ", pm_states[i]); 268 s += sprintf(s,"%s ", pm_states[i]);
291 } 269 }
292 s += sprintf(s,"\n"); 270#ifdef CONFIG_SOFTWARE_SUSPEND
271 s += sprintf(s, "%s\n", "disk");
272#else
273 if (s != buf)
274 /* convert the last space to a newline */
275 *(s-1) = '\n';
276#endif
293 return (s - buf); 277 return (s - buf);
294} 278}
295 279
@@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
304 p = memchr(buf, '\n', n); 288 p = memchr(buf, '\n', n);
305 len = p ? p - buf : n; 289 len = p ? p - buf : n;
306 290
291 /* First, check if we are requested to hibernate */
292 if (!strncmp(buf, "disk", len)) {
293 error = hibernate();
294 return error ? error : n;
295 }
296
307 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 297 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
308 if (*s && !strncmp(buf, *s, len)) 298 if (*s && !strncmp(buf, *s, len))
309 break; 299 break;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 34b43542785a..51381487103f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,12 +25,7 @@ struct swsusp_info {
25 */ 25 */
26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
27 27
28extern int pm_suspend_disk(void); 28extern struct hibernation_ops *hibernation_ops;
29#else
30static inline int pm_suspend_disk(void)
31{
32 return -EPERM;
33}
34#endif 29#endif
35 30
36extern int pfn_is_nosave(unsigned long); 31extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 59fb89ba9a4d..a3b7854b8f7c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1233,7 +1233,7 @@ asmlinkage int swsusp_save(void)
1233 nr_copy_pages = nr_pages; 1233 nr_copy_pages = nr_pages;
1234 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); 1234 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
1235 1235
1236 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1236 printk("swsusp: critical section: done (%d pages copied)\n", nr_pages);
1237 1237
1238 return 0; 1238 return 0;
1239} 1239}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 040560d9c312..24d7d78e6f42 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -130,16 +130,16 @@ static inline int platform_prepare(void)
130{ 130{
131 int error = 0; 131 int error = 0;
132 132
133 if (pm_ops && pm_ops->prepare) 133 if (hibernation_ops)
134 error = pm_ops->prepare(PM_SUSPEND_DISK); 134 error = hibernation_ops->prepare();
135 135
136 return error; 136 return error;
137} 137}
138 138
139static inline void platform_finish(void) 139static inline void platform_finish(void)
140{ 140{
141 if (pm_ops && pm_ops->finish) 141 if (hibernation_ops)
142 pm_ops->finish(PM_SUSPEND_DISK); 142 hibernation_ops->finish();
143} 143}
144 144
145static inline int snapshot_suspend(int platform_suspend) 145static inline int snapshot_suspend(int platform_suspend)
@@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
384 switch (arg) { 384 switch (arg) {
385 385
386 case PMOPS_PREPARE: 386 case PMOPS_PREPARE:
387 if (pm_ops && pm_ops->enter) { 387 if (hibernation_ops) {
388 data->platform_suspend = 1; 388 data->platform_suspend = 1;
389 error = 0; 389 error = 0;
390 } else { 390 } else {
@@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
395 case PMOPS_ENTER: 395 case PMOPS_ENTER:
396 if (data->platform_suspend) { 396 if (data->platform_suspend) {
397 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 397 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
398 error = pm_ops->enter(PM_SUSPEND_DISK); 398 error = hibernation_ops->enter();
399 error = 0;
400 } 399 }
401 break; 400 break;
402 401
diff --git a/kernel/profile.c b/kernel/profile.c
index 9bfadb248dd8..cc91b9bf759d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
340 340
341 switch (action) { 341 switch (action) {
342 case CPU_UP_PREPARE: 342 case CPU_UP_PREPARE:
343 case CPU_UP_PREPARE_FROZEN:
343 node = cpu_to_node(cpu); 344 node = cpu_to_node(cpu);
344 per_cpu(cpu_profile_flip, cpu) = 0; 345 per_cpu(cpu_profile_flip, cpu) = 0;
345 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 346 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
@@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
365 __free_page(page); 366 __free_page(page);
366 return NOTIFY_BAD; 367 return NOTIFY_BAD;
367 case CPU_ONLINE: 368 case CPU_ONLINE:
369 case CPU_ONLINE_FROZEN:
368 cpu_set(cpu, prof_cpu_mask); 370 cpu_set(cpu, prof_cpu_mask);
369 break; 371 break;
370 case CPU_UP_CANCELED: 372 case CPU_UP_CANCELED:
373 case CPU_UP_CANCELED_FROZEN:
371 case CPU_DEAD: 374 case CPU_DEAD:
375 case CPU_DEAD_FROZEN:
372 cpu_clear(cpu, prof_cpu_mask); 376 cpu_clear(cpu, prof_cpu_mask);
373 if (per_cpu(cpu_profile_hits, cpu)[0]) { 377 if (per_cpu(cpu_profile_hits, cpu)[0]) {
374 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 378 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 3554b76da84c..2c2dd8410dc4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
558 long cpu = (long)hcpu; 558 long cpu = (long)hcpu;
559 switch (action) { 559 switch (action) {
560 case CPU_UP_PREPARE: 560 case CPU_UP_PREPARE:
561 case CPU_UP_PREPARE_FROZEN:
561 rcu_online_cpu(cpu); 562 rcu_online_cpu(cpu);
562 break; 563 break;
563 case CPU_DEAD: 564 case CPU_DEAD:
565 case CPU_DEAD_FROZEN:
564 rcu_offline_cpu(cpu); 566 rcu_offline_cpu(cpu);
565 break; 567 break;
566 default: 568 default:
diff --git a/kernel/relay.c b/kernel/relay.c
index 577f251c7e28..4311101b0ca7 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = {
310 310
311/** 311/**
312 * wakeup_readers - wake up readers waiting on a channel 312 * wakeup_readers - wake up readers waiting on a channel
313 * @work: work struct that contains the the channel buffer 313 * @data: contains the channel buffer
314 * 314 *
315 * This is the work function used to defer reader waking. The 315 * This is the timer function used to defer reader waking.
316 * reason waking is deferred is that calling directly from write
317 * causes problems if you're writing from say the scheduler.
318 */ 316 */
319static void wakeup_readers(struct work_struct *work) 317static void wakeup_readers(unsigned long data)
320{ 318{
321 struct rchan_buf *buf = 319 struct rchan_buf *buf = (struct rchan_buf *)data;
322 container_of(work, struct rchan_buf, wake_readers.work);
323 wake_up_interruptible(&buf->read_wait); 320 wake_up_interruptible(&buf->read_wait);
324} 321}
325 322
@@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
337 if (init) { 334 if (init) {
338 init_waitqueue_head(&buf->read_wait); 335 init_waitqueue_head(&buf->read_wait);
339 kref_init(&buf->kref); 336 kref_init(&buf->kref);
340 INIT_DELAYED_WORK(&buf->wake_readers, NULL); 337 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
341 } else { 338 } else
342 cancel_delayed_work(&buf->wake_readers); 339 del_timer_sync(&buf->timer);
343 flush_scheduled_work();
344 }
345 340
346 buf->subbufs_produced = 0; 341 buf->subbufs_produced = 0;
347 buf->subbufs_consumed = 0; 342 buf->subbufs_consumed = 0;
@@ -447,8 +442,7 @@ end:
447static void relay_close_buf(struct rchan_buf *buf) 442static void relay_close_buf(struct rchan_buf *buf)
448{ 443{
449 buf->finalized = 1; 444 buf->finalized = 1;
450 cancel_delayed_work(&buf->wake_readers); 445 del_timer_sync(&buf->timer);
451 flush_scheduled_work();
452 kref_put(&buf->kref, relay_remove_buf); 446 kref_put(&buf->kref, relay_remove_buf);
453} 447}
454 448
@@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
490 484
491 switch(action) { 485 switch(action) {
492 case CPU_UP_PREPARE: 486 case CPU_UP_PREPARE:
487 case CPU_UP_PREPARE_FROZEN:
493 mutex_lock(&relay_channels_mutex); 488 mutex_lock(&relay_channels_mutex);
494 list_for_each_entry(chan, &relay_channels, list) { 489 list_for_each_entry(chan, &relay_channels, list) {
495 if (chan->buf[hotcpu]) 490 if (chan->buf[hotcpu])
@@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
506 mutex_unlock(&relay_channels_mutex); 501 mutex_unlock(&relay_channels_mutex);
507 break; 502 break;
508 case CPU_DEAD: 503 case CPU_DEAD:
504 case CPU_DEAD_FROZEN:
509 /* No need to flush the cpu : will be flushed upon 505 /* No need to flush the cpu : will be flushed upon
510 * final relay_flush() call. */ 506 * final relay_flush() call. */
511 break; 507 break;
@@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
608 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 604 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
609 buf->padding[old_subbuf]; 605 buf->padding[old_subbuf];
610 smp_mb(); 606 smp_mb();
611 if (waitqueue_active(&buf->read_wait)) { 607 if (waitqueue_active(&buf->read_wait))
612 PREPARE_DELAYED_WORK(&buf->wake_readers, 608 /*
613 wakeup_readers); 609 * Calling wake_up_interruptible() from here
614 schedule_delayed_work(&buf->wake_readers, 1); 610 * will deadlock if we happen to be logging
615 } 611 * from the scheduler (trying to re-grab
612 * rq->lock), so defer it.
613 */
614 __mod_timer(&buf->timer, jiffies + 1);
616 } 615 }
617 616
618 old = buf->data; 617 old = buf->data;
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978cb2f75..12879f6c1ec3 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
56 * state. 56 * state.
57 */ 57 */
58 58
59static void 59void
60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
61 unsigned long mask) 61 unsigned long mask)
62{ 62{
@@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
81} 81}
82 82
83/* 83/*
84 * We can speed up the acquire/release, if the architecture
85 * supports cmpxchg and if there's no debugging state to be set up
86 */
87#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
88# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
89static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90{
91 unsigned long owner, *p = (unsigned long *) &lock->owner;
92
93 do {
94 owner = *p;
95 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
96}
97#else
98# define rt_mutex_cmpxchg(l,c,n) (0)
99static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
100{
101 lock->owner = (struct task_struct *)
102 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
103}
104#endif
105
106/*
107 * Calculate task priority from the waiter list priority 84 * Calculate task priority from the waiter list priority
108 * 85 *
109 * Return task->normal_prio when the waiter list is empty or when 86 * Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)
123 * 100 *
124 * This can be both boosting and unboosting. task->pi_lock must be held. 101 * This can be both boosting and unboosting. task->pi_lock must be held.
125 */ 102 */
126static void __rt_mutex_adjust_prio(struct task_struct *task) 103void __rt_mutex_adjust_prio(struct task_struct *task)
127{ 104{
128 int prio = rt_mutex_getprio(task); 105 int prio = rt_mutex_getprio(task);
129 106
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
159 * Decreases task's usage by one - may thus free the task. 136 * Decreases task's usage by one - may thus free the task.
160 * Returns 0 or -EDEADLK. 137 * Returns 0 or -EDEADLK.
161 */ 138 */
162static int rt_mutex_adjust_prio_chain(struct task_struct *task, 139int rt_mutex_adjust_prio_chain(struct task_struct *task,
163 int deadlock_detect, 140 int deadlock_detect,
164 struct rt_mutex *orig_lock, 141 struct rt_mutex *orig_lock,
165 struct rt_mutex_waiter *orig_waiter, 142 struct rt_mutex_waiter *orig_waiter,
166 struct task_struct *top_task) 143 struct task_struct *top_task)
167{ 144{
168 struct rt_mutex *lock; 145 struct rt_mutex *lock;
169 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; 146 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
524 * 501 *
525 * Must be called with lock->wait_lock held 502 * Must be called with lock->wait_lock held
526 */ 503 */
527static void remove_waiter(struct rt_mutex *lock, 504void remove_waiter(struct rt_mutex *lock,
528 struct rt_mutex_waiter *waiter) 505 struct rt_mutex_waiter *waiter)
529{ 506{
530 int first = (waiter == rt_mutex_top_waiter(lock)); 507 int first = (waiter == rt_mutex_top_waiter(lock));
531 struct task_struct *owner = rt_mutex_owner(lock); 508 struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e791e..242ec7ee740b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
113} 113}
114 114
115/* 115/*
116 * We can speed up the acquire/release, if the architecture
117 * supports cmpxchg and if there's no debugging state to be set up
118 */
119#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
120# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
121static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
122{
123 unsigned long owner, *p = (unsigned long *) &lock->owner;
124
125 do {
126 owner = *p;
127 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
128}
129#else
130# define rt_mutex_cmpxchg(l,c,n) (0)
131static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
132{
133 lock->owner = (struct task_struct *)
134 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
135}
136#endif
137
138/*
116 * PI-futex support (proxy locking functions, etc.): 139 * PI-futex support (proxy locking functions, etc.):
117 */ 140 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); 141extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 143 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 144extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 145 struct task_struct *proxy_owner);
146
147extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
148 unsigned long mask);
149extern void __rt_mutex_adjust_prio(struct task_struct *task);
150extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
151 int deadlock_detect,
152 struct rt_mutex *orig_lock,
153 struct rt_mutex_waiter *orig_waiter,
154 struct task_struct *top_task);
155extern void remove_waiter(struct rt_mutex *lock,
156 struct rt_mutex_waiter *waiter);
123#endif 157#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 66bd7ff23f18..799d23b4e35d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -305,6 +305,7 @@ struct rq {
305}; 305};
306 306
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex);
308 309
309static inline int cpu_of(struct rq *rq) 310static inline int cpu_of(struct rq *rq)
310{ 311{
@@ -4520,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4520 struct task_struct *p; 4521 struct task_struct *p;
4521 int retval; 4522 int retval;
4522 4523
4523 lock_cpu_hotplug(); 4524 mutex_lock(&sched_hotcpu_mutex);
4524 read_lock(&tasklist_lock); 4525 read_lock(&tasklist_lock);
4525 4526
4526 p = find_process_by_pid(pid); 4527 p = find_process_by_pid(pid);
4527 if (!p) { 4528 if (!p) {
4528 read_unlock(&tasklist_lock); 4529 read_unlock(&tasklist_lock);
4529 unlock_cpu_hotplug(); 4530 mutex_unlock(&sched_hotcpu_mutex);
4530 return -ESRCH; 4531 return -ESRCH;
4531 } 4532 }
4532 4533
@@ -4553,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4553 4554
4554out_unlock: 4555out_unlock:
4555 put_task_struct(p); 4556 put_task_struct(p);
4556 unlock_cpu_hotplug(); 4557 mutex_unlock(&sched_hotcpu_mutex);
4557 return retval; 4558 return retval;
4558} 4559}
4559 4560
@@ -4610,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4610 struct task_struct *p; 4611 struct task_struct *p;
4611 int retval; 4612 int retval;
4612 4613
4613 lock_cpu_hotplug(); 4614 mutex_lock(&sched_hotcpu_mutex);
4614 read_lock(&tasklist_lock); 4615 read_lock(&tasklist_lock);
4615 4616
4616 retval = -ESRCH; 4617 retval = -ESRCH;
@@ -4626,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4626 4627
4627out_unlock: 4628out_unlock:
4628 read_unlock(&tasklist_lock); 4629 read_unlock(&tasklist_lock);
4629 unlock_cpu_hotplug(); 4630 mutex_unlock(&sched_hotcpu_mutex);
4630 if (retval) 4631 if (retval)
4631 return retval; 4632 return retval;
4632 4633
@@ -5388,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5388 struct rq *rq; 5389 struct rq *rq;
5389 5390
5390 switch (action) { 5391 switch (action) {
5392 case CPU_LOCK_ACQUIRE:
5393 mutex_lock(&sched_hotcpu_mutex);
5394 break;
5395
5391 case CPU_UP_PREPARE: 5396 case CPU_UP_PREPARE:
5397 case CPU_UP_PREPARE_FROZEN:
5392 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5398 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
5393 if (IS_ERR(p)) 5399 if (IS_ERR(p))
5394 return NOTIFY_BAD; 5400 return NOTIFY_BAD;
@@ -5402,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5402 break; 5408 break;
5403 5409
5404 case CPU_ONLINE: 5410 case CPU_ONLINE:
5411 case CPU_ONLINE_FROZEN:
5405 /* Strictly unneccessary, as first user will wake it. */ 5412 /* Strictly unneccessary, as first user will wake it. */
5406 wake_up_process(cpu_rq(cpu)->migration_thread); 5413 wake_up_process(cpu_rq(cpu)->migration_thread);
5407 break; 5414 break;
5408 5415
5409#ifdef CONFIG_HOTPLUG_CPU 5416#ifdef CONFIG_HOTPLUG_CPU
5410 case CPU_UP_CANCELED: 5417 case CPU_UP_CANCELED:
5418 case CPU_UP_CANCELED_FROZEN:
5411 if (!cpu_rq(cpu)->migration_thread) 5419 if (!cpu_rq(cpu)->migration_thread)
5412 break; 5420 break;
5413 /* Unbind it from offline cpu so it can run. Fall thru. */ 5421 /* Unbind it from offline cpu so it can run. Fall thru. */
@@ -5418,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5418 break; 5426 break;
5419 5427
5420 case CPU_DEAD: 5428 case CPU_DEAD:
5429 case CPU_DEAD_FROZEN:
5421 migrate_live_tasks(cpu); 5430 migrate_live_tasks(cpu);
5422 rq = cpu_rq(cpu); 5431 rq = cpu_rq(cpu);
5423 kthread_stop(rq->migration_thread); 5432 kthread_stop(rq->migration_thread);
@@ -5433,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5433 BUG_ON(rq->nr_running != 0); 5442 BUG_ON(rq->nr_running != 0);
5434 5443
5435 /* No need to migrate the tasks: it was best-effort if 5444 /* No need to migrate the tasks: it was best-effort if
5436 * they didn't do lock_cpu_hotplug(). Just wake up 5445 * they didn't take sched_hotcpu_mutex. Just wake up
5437 * the requestors. */ 5446 * the requestors. */
5438 spin_lock_irq(&rq->lock); 5447 spin_lock_irq(&rq->lock);
5439 while (!list_empty(&rq->migration_queue)) { 5448 while (!list_empty(&rq->migration_queue)) {
@@ -5447,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5447 spin_unlock_irq(&rq->lock); 5456 spin_unlock_irq(&rq->lock);
5448 break; 5457 break;
5449#endif 5458#endif
5459 case CPU_LOCK_RELEASE:
5460 mutex_unlock(&sched_hotcpu_mutex);
5461 break;
5450 } 5462 }
5451 return NOTIFY_OK; 5463 return NOTIFY_OK;
5452} 5464}
@@ -6822,10 +6834,10 @@ int arch_reinit_sched_domains(void)
6822{ 6834{
6823 int err; 6835 int err;
6824 6836
6825 lock_cpu_hotplug(); 6837 mutex_lock(&sched_hotcpu_mutex);
6826 detach_destroy_domains(&cpu_online_map); 6838 detach_destroy_domains(&cpu_online_map);
6827 err = arch_init_sched_domains(&cpu_online_map); 6839 err = arch_init_sched_domains(&cpu_online_map);
6828 unlock_cpu_hotplug(); 6840 mutex_unlock(&sched_hotcpu_mutex);
6829 6841
6830 return err; 6842 return err;
6831} 6843}
@@ -6904,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,
6904{ 6916{
6905 switch (action) { 6917 switch (action) {
6906 case CPU_UP_PREPARE: 6918 case CPU_UP_PREPARE:
6919 case CPU_UP_PREPARE_FROZEN:
6907 case CPU_DOWN_PREPARE: 6920 case CPU_DOWN_PREPARE:
6921 case CPU_DOWN_PREPARE_FROZEN:
6908 detach_destroy_domains(&cpu_online_map); 6922 detach_destroy_domains(&cpu_online_map);
6909 return NOTIFY_OK; 6923 return NOTIFY_OK;
6910 6924
6911 case CPU_UP_CANCELED: 6925 case CPU_UP_CANCELED:
6926 case CPU_UP_CANCELED_FROZEN:
6912 case CPU_DOWN_FAILED: 6927 case CPU_DOWN_FAILED:
6928 case CPU_DOWN_FAILED_FROZEN:
6913 case CPU_ONLINE: 6929 case CPU_ONLINE:
6930 case CPU_ONLINE_FROZEN:
6914 case CPU_DEAD: 6931 case CPU_DEAD:
6932 case CPU_DEAD_FROZEN:
6915 /* 6933 /*
6916 * Fall through and re-initialise the domains. 6934 * Fall through and re-initialise the domains.
6917 */ 6935 */
@@ -6930,12 +6948,12 @@ void __init sched_init_smp(void)
6930{ 6948{
6931 cpumask_t non_isolated_cpus; 6949 cpumask_t non_isolated_cpus;
6932 6950
6933 lock_cpu_hotplug(); 6951 mutex_lock(&sched_hotcpu_mutex);
6934 arch_init_sched_domains(&cpu_online_map); 6952 arch_init_sched_domains(&cpu_online_map);
6935 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 6953 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6936 if (cpus_empty(non_isolated_cpus)) 6954 if (cpus_empty(non_isolated_cpus))
6937 cpu_set(smp_processor_id(), non_isolated_cpus); 6955 cpu_set(smp_processor_id(), non_isolated_cpus);
6938 unlock_cpu_hotplug(); 6956 mutex_unlock(&sched_hotcpu_mutex);
6939 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6957 /* XXX: Theoretical race here - CPU may be hotplugged now */
6940 hotcpu_notifier(update_sched_domains, 0); 6958 hotcpu_notifier(update_sched_domains, 0);
6941 6959
diff --git a/kernel/signal.c b/kernel/signal.c
index 1368e67c8482..2ac3a668d9dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -38,125 +38,6 @@
38 38
39static struct kmem_cache *sigqueue_cachep; 39static struct kmem_cache *sigqueue_cachep;
40 40
41/*
42 * In POSIX a signal is sent either to a specific thread (Linux task)
43 * or to the process as a whole (Linux thread group). How the signal
44 * is sent determines whether it's to one thread or the whole group,
45 * which determines which signal mask(s) are involved in blocking it
46 * from being delivered until later. When the signal is delivered,
47 * either it's caught or ignored by a user handler or it has a default
48 * effect that applies to the whole thread group (POSIX process).
49 *
50 * The possible effects an unblocked signal set to SIG_DFL can have are:
51 * ignore - Nothing Happens
52 * terminate - kill the process, i.e. all threads in the group,
53 * similar to exit_group. The group leader (only) reports
54 * WIFSIGNALED status to its parent.
55 * coredump - write a core dump file describing all threads using
56 * the same mm and then kill all those threads
57 * stop - stop all the threads in the group, i.e. TASK_STOPPED state
58 *
59 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
60 * Other signals when not blocked and set to SIG_DFL behaves as follows.
61 * The job control signals also have other special effects.
62 *
63 * +--------------------+------------------+
64 * | POSIX signal | default action |
65 * +--------------------+------------------+
66 * | SIGHUP | terminate |
67 * | SIGINT | terminate |
68 * | SIGQUIT | coredump |
69 * | SIGILL | coredump |
70 * | SIGTRAP | coredump |
71 * | SIGABRT/SIGIOT | coredump |
72 * | SIGBUS | coredump |
73 * | SIGFPE | coredump |
74 * | SIGKILL | terminate(+) |
75 * | SIGUSR1 | terminate |
76 * | SIGSEGV | coredump |
77 * | SIGUSR2 | terminate |
78 * | SIGPIPE | terminate |
79 * | SIGALRM | terminate |
80 * | SIGTERM | terminate |
81 * | SIGCHLD | ignore |
82 * | SIGCONT | ignore(*) |
83 * | SIGSTOP | stop(*)(+) |
84 * | SIGTSTP | stop(*) |
85 * | SIGTTIN | stop(*) |
86 * | SIGTTOU | stop(*) |
87 * | SIGURG | ignore |
88 * | SIGXCPU | coredump |
89 * | SIGXFSZ | coredump |
90 * | SIGVTALRM | terminate |
91 * | SIGPROF | terminate |
92 * | SIGPOLL/SIGIO | terminate |
93 * | SIGSYS/SIGUNUSED | coredump |
94 * | SIGSTKFLT | terminate |
95 * | SIGWINCH | ignore |
96 * | SIGPWR | terminate |
97 * | SIGRTMIN-SIGRTMAX | terminate |
98 * +--------------------+------------------+
99 * | non-POSIX signal | default action |
100 * +--------------------+------------------+
101 * | SIGEMT | coredump |
102 * +--------------------+------------------+
103 *
104 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
105 * (*) Special job control effects:
106 * When SIGCONT is sent, it resumes the process (all threads in the group)
107 * from TASK_STOPPED state and also clears any pending/queued stop signals
108 * (any of those marked with "stop(*)"). This happens regardless of blocking,
109 * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
110 * any pending/queued SIGCONT signals; this happens regardless of blocking,
111 * catching, or ignored the stop signal, though (except for SIGSTOP) the
112 * default action of stopping the process may happen later or never.
113 */
114
115#ifdef SIGEMT
116#define M_SIGEMT M(SIGEMT)
117#else
118#define M_SIGEMT 0
119#endif
120
121#if SIGRTMIN > BITS_PER_LONG
122#define M(sig) (1ULL << ((sig)-1))
123#else
124#define M(sig) (1UL << ((sig)-1))
125#endif
126#define T(sig, mask) (M(sig) & (mask))
127
128#define SIG_KERNEL_ONLY_MASK (\
129 M(SIGKILL) | M(SIGSTOP) )
130
131#define SIG_KERNEL_STOP_MASK (\
132 M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) )
133
134#define SIG_KERNEL_COREDUMP_MASK (\
135 M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \
136 M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \
137 M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT )
138
139#define SIG_KERNEL_IGNORE_MASK (\
140 M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) )
141
142#define sig_kernel_only(sig) \
143 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK))
144#define sig_kernel_coredump(sig) \
145 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK))
146#define sig_kernel_ignore(sig) \
147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK))
148#define sig_kernel_stop(sig) \
149 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
150
151#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
152
153#define sig_user_defined(t, signr) \
154 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
155 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
156
157#define sig_fatal(t, signr) \
158 (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
159 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
160 41
161static int sig_ignored(struct task_struct *t, int sig) 42static int sig_ignored(struct task_struct *t, int sig)
162{ 43{
@@ -328,6 +209,16 @@ void flush_signals(struct task_struct *t)
328 spin_unlock_irqrestore(&t->sighand->siglock, flags); 209 spin_unlock_irqrestore(&t->sighand->siglock, flags);
329} 210}
330 211
212void ignore_signals(struct task_struct *t)
213{
214 int i;
215
216 for (i = 0; i < _NSIG; ++i)
217 t->sighand->action[i].sa.sa_handler = SIG_IGN;
218
219 flush_signals(t);
220}
221
331/* 222/*
332 * Flush all handlers for a task. 223 * Flush all handlers for a task.
333 */ 224 */
@@ -1032,17 +923,6 @@ void zap_other_threads(struct task_struct *p)
1032 if (t->exit_state) 923 if (t->exit_state)
1033 continue; 924 continue;
1034 925
1035 /*
1036 * We don't want to notify the parent, since we are
1037 * killed as part of a thread group due to another
1038 * thread doing an execve() or similar. So set the
1039 * exit signal to -1 to allow immediate reaping of
1040 * the process. But don't detach the thread group
1041 * leader.
1042 */
1043 if (t != p->group_leader)
1044 t->exit_signal = -1;
1045
1046 /* SIGKILL will be handled before any pending SIGSTOP */ 926 /* SIGKILL will be handled before any pending SIGSTOP */
1047 sigaddset(&t->pending.signal, SIGKILL); 927 sigaddset(&t->pending.signal, SIGKILL);
1048 signal_wake_up(t, 1); 928 signal_wake_up(t, 1);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b75008e2bd8..0b9886a00e74 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
593 593
594 switch (action) { 594 switch (action) {
595 case CPU_UP_PREPARE: 595 case CPU_UP_PREPARE:
596 case CPU_UP_PREPARE_FROZEN:
596 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 597 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
597 if (IS_ERR(p)) { 598 if (IS_ERR(p)) {
598 printk("ksoftirqd for %i failed\n", hotcpu); 599 printk("ksoftirqd for %i failed\n", hotcpu);
@@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
602 per_cpu(ksoftirqd, hotcpu) = p; 603 per_cpu(ksoftirqd, hotcpu) = p;
603 break; 604 break;
604 case CPU_ONLINE: 605 case CPU_ONLINE:
606 case CPU_ONLINE_FROZEN:
605 wake_up_process(per_cpu(ksoftirqd, hotcpu)); 607 wake_up_process(per_cpu(ksoftirqd, hotcpu));
606 break; 608 break;
607#ifdef CONFIG_HOTPLUG_CPU 609#ifdef CONFIG_HOTPLUG_CPU
608 case CPU_UP_CANCELED: 610 case CPU_UP_CANCELED:
611 case CPU_UP_CANCELED_FROZEN:
609 if (!per_cpu(ksoftirqd, hotcpu)) 612 if (!per_cpu(ksoftirqd, hotcpu))
610 break; 613 break;
611 /* Unbind so it can run. Fall thru. */ 614 /* Unbind so it can run. Fall thru. */
612 kthread_bind(per_cpu(ksoftirqd, hotcpu), 615 kthread_bind(per_cpu(ksoftirqd, hotcpu),
613 any_online_cpu(cpu_online_map)); 616 any_online_cpu(cpu_online_map));
614 case CPU_DEAD: 617 case CPU_DEAD:
618 case CPU_DEAD_FROZEN:
615 p = per_cpu(ksoftirqd, hotcpu); 619 p = per_cpu(ksoftirqd, hotcpu);
616 per_cpu(ksoftirqd, hotcpu) = NULL; 620 per_cpu(ksoftirqd, hotcpu) = NULL;
617 kthread_stop(p); 621 kthread_stop(p);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8fa7040247ad..0131e296ffb4 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
146 146
147 switch (action) { 147 switch (action) {
148 case CPU_UP_PREPARE: 148 case CPU_UP_PREPARE:
149 case CPU_UP_PREPARE_FROZEN:
149 BUG_ON(per_cpu(watchdog_task, hotcpu)); 150 BUG_ON(per_cpu(watchdog_task, hotcpu));
150 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 151 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
151 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
@@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
157 kthread_bind(p, hotcpu); 158 kthread_bind(p, hotcpu);
158 break; 159 break;
159 case CPU_ONLINE: 160 case CPU_ONLINE:
161 case CPU_ONLINE_FROZEN:
160 wake_up_process(per_cpu(watchdog_task, hotcpu)); 162 wake_up_process(per_cpu(watchdog_task, hotcpu));
161 break; 163 break;
162#ifdef CONFIG_HOTPLUG_CPU 164#ifdef CONFIG_HOTPLUG_CPU
163 case CPU_UP_CANCELED: 165 case CPU_UP_CANCELED:
166 case CPU_UP_CANCELED_FROZEN:
164 if (!per_cpu(watchdog_task, hotcpu)) 167 if (!per_cpu(watchdog_task, hotcpu))
165 break; 168 break;
166 /* Unbind so it can run. Fall thru. */ 169 /* Unbind so it can run. Fall thru. */
167 kthread_bind(per_cpu(watchdog_task, hotcpu), 170 kthread_bind(per_cpu(watchdog_task, hotcpu),
168 any_online_cpu(cpu_online_map)); 171 any_online_cpu(cpu_online_map));
169 case CPU_DEAD: 172 case CPU_DEAD:
173 case CPU_DEAD_FROZEN:
170 p = per_cpu(watchdog_task, hotcpu); 174 p = per_cpu(watchdog_task, hotcpu);
171 per_cpu(watchdog_task, hotcpu) = NULL; 175 per_cpu(watchdog_task, hotcpu) = NULL;
172 kthread_stop(p); 176 kthread_stop(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 926bf9d7ac45..cdb7e9457ba6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl,
134 return -ENOENT; 134 return -ENOENT;
135} 135}
136 136
137/**
138 * notifier_call_chain - Informs the registered notifiers about an event.
139 * @nl: Pointer to head of the blocking notifier chain
140 * @val: Value passed unmodified to notifier function
141 * @v: Pointer passed unmodified to notifier function
142 * @nr_to_call: Number of notifier functions to be called. Don't care
143 * value of this parameter is -1.
144 * @nr_calls: Records the number of notifications sent. Don't care
145 * value of this field is NULL.
146 * @returns: notifier_call_chain returns the value returned by the
147 * last notifier function called.
148 */
149
137static int __kprobes notifier_call_chain(struct notifier_block **nl, 150static int __kprobes notifier_call_chain(struct notifier_block **nl,
138 unsigned long val, void *v) 151 unsigned long val, void *v,
152 int nr_to_call, int *nr_calls)
139{ 153{
140 int ret = NOTIFY_DONE; 154 int ret = NOTIFY_DONE;
141 struct notifier_block *nb, *next_nb; 155 struct notifier_block *nb, *next_nb;
142 156
143 nb = rcu_dereference(*nl); 157 nb = rcu_dereference(*nl);
144 while (nb) { 158
159 while (nb && nr_to_call) {
145 next_nb = rcu_dereference(nb->next); 160 next_nb = rcu_dereference(nb->next);
146 ret = nb->notifier_call(nb, val, v); 161 ret = nb->notifier_call(nb, val, v);
162
163 if (nr_calls)
164 (*nr_calls)++;
165
147 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 166 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
148 break; 167 break;
149 nb = next_nb; 168 nb = next_nb;
169 nr_to_call--;
150 } 170 }
151 return ret; 171 return ret;
152} 172}
@@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
205EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); 225EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
206 226
207/** 227/**
208 * atomic_notifier_call_chain - Call functions in an atomic notifier chain 228 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
209 * @nh: Pointer to head of the atomic notifier chain 229 * @nh: Pointer to head of the atomic notifier chain
210 * @val: Value passed unmodified to notifier function 230 * @val: Value passed unmodified to notifier function
211 * @v: Pointer passed unmodified to notifier function 231 * @v: Pointer passed unmodified to notifier function
232 * @nr_to_call: See the comment for notifier_call_chain.
233 * @nr_calls: See the comment for notifier_call_chain.
212 * 234 *
213 * Calls each function in a notifier chain in turn. The functions 235 * Calls each function in a notifier chain in turn. The functions
214 * run in an atomic context, so they must not block. 236 * run in an atomic context, so they must not block.
@@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
222 * of the last notifier function called. 244 * of the last notifier function called.
223 */ 245 */
224 246
225int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, 247int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
226 unsigned long val, void *v) 248 unsigned long val, void *v,
249 int nr_to_call, int *nr_calls)
227{ 250{
228 int ret; 251 int ret;
229 252
230 rcu_read_lock(); 253 rcu_read_lock();
231 ret = notifier_call_chain(&nh->head, val, v); 254 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
232 rcu_read_unlock(); 255 rcu_read_unlock();
233 return ret; 256 return ret;
234} 257}
235 258
236EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); 259EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
260
261int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
262 unsigned long val, void *v)
263{
264 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
265}
237 266
267EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
238/* 268/*
239 * Blocking notifier chain routines. All access to the chain is 269 * Blocking notifier chain routines. All access to the chain is
240 * synchronized by an rwsem. 270 * synchronized by an rwsem.
@@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
304EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); 334EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
305 335
306/** 336/**
307 * blocking_notifier_call_chain - Call functions in a blocking notifier chain 337 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
308 * @nh: Pointer to head of the blocking notifier chain 338 * @nh: Pointer to head of the blocking notifier chain
309 * @val: Value passed unmodified to notifier function 339 * @val: Value passed unmodified to notifier function
310 * @v: Pointer passed unmodified to notifier function 340 * @v: Pointer passed unmodified to notifier function
341 * @nr_to_call: See comment for notifier_call_chain.
342 * @nr_calls: See comment for notifier_call_chain.
311 * 343 *
312 * Calls each function in a notifier chain in turn. The functions 344 * Calls each function in a notifier chain in turn. The functions
313 * run in a process context, so they are allowed to block. 345 * run in a process context, so they are allowed to block.
@@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
320 * of the last notifier function called. 352 * of the last notifier function called.
321 */ 353 */
322 354
323int blocking_notifier_call_chain(struct blocking_notifier_head *nh, 355int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
324 unsigned long val, void *v) 356 unsigned long val, void *v,
357 int nr_to_call, int *nr_calls)
325{ 358{
326 int ret = NOTIFY_DONE; 359 int ret = NOTIFY_DONE;
327 360
@@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
332 */ 365 */
333 if (rcu_dereference(nh->head)) { 366 if (rcu_dereference(nh->head)) {
334 down_read(&nh->rwsem); 367 down_read(&nh->rwsem);
335 ret = notifier_call_chain(&nh->head, val, v); 368 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
369 nr_calls);
336 up_read(&nh->rwsem); 370 up_read(&nh->rwsem);
337 } 371 }
338 return ret; 372 return ret;
339} 373}
374EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
340 375
376int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
377 unsigned long val, void *v)
378{
379 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
380}
341EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); 381EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
342 382
343/* 383/*
@@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
383EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); 423EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
384 424
385/** 425/**
386 * raw_notifier_call_chain - Call functions in a raw notifier chain 426 * __raw_notifier_call_chain - Call functions in a raw notifier chain
387 * @nh: Pointer to head of the raw notifier chain 427 * @nh: Pointer to head of the raw notifier chain
388 * @val: Value passed unmodified to notifier function 428 * @val: Value passed unmodified to notifier function
389 * @v: Pointer passed unmodified to notifier function 429 * @v: Pointer passed unmodified to notifier function
430 * @nr_to_call: See comment for notifier_call_chain.
431 * @nr_calls: See comment for notifier_call_chain
390 * 432 *
391 * Calls each function in a notifier chain in turn. The functions 433 * Calls each function in a notifier chain in turn. The functions
392 * run in an undefined context. 434 * run in an undefined context.
@@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
400 * of the last notifier function called. 442 * of the last notifier function called.
401 */ 443 */
402 444
445int __raw_notifier_call_chain(struct raw_notifier_head *nh,
446 unsigned long val, void *v,
447 int nr_to_call, int *nr_calls)
448{
449 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
450}
451
452EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
453
403int raw_notifier_call_chain(struct raw_notifier_head *nh, 454int raw_notifier_call_chain(struct raw_notifier_head *nh,
404 unsigned long val, void *v) 455 unsigned long val, void *v)
405{ 456{
406 return notifier_call_chain(&nh->head, val, v); 457 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
407} 458}
408 459
409EXPORT_SYMBOL_GPL(raw_notifier_call_chain); 460EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
478EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); 529EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
479 530
480/** 531/**
481 * srcu_notifier_call_chain - Call functions in an SRCU notifier chain 532 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
482 * @nh: Pointer to head of the SRCU notifier chain 533 * @nh: Pointer to head of the SRCU notifier chain
483 * @val: Value passed unmodified to notifier function 534 * @val: Value passed unmodified to notifier function
484 * @v: Pointer passed unmodified to notifier function 535 * @v: Pointer passed unmodified to notifier function
536 * @nr_to_call: See comment for notifier_call_chain.
537 * @nr_calls: See comment for notifier_call_chain
485 * 538 *
486 * Calls each function in a notifier chain in turn. The functions 539 * Calls each function in a notifier chain in turn. The functions
487 * run in a process context, so they are allowed to block. 540 * run in a process context, so they are allowed to block.
@@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
494 * of the last notifier function called. 547 * of the last notifier function called.
495 */ 548 */
496 549
497int srcu_notifier_call_chain(struct srcu_notifier_head *nh, 550int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
498 unsigned long val, void *v) 551 unsigned long val, void *v,
552 int nr_to_call, int *nr_calls)
499{ 553{
500 int ret; 554 int ret;
501 int idx; 555 int idx;
502 556
503 idx = srcu_read_lock(&nh->srcu); 557 idx = srcu_read_lock(&nh->srcu);
504 ret = notifier_call_chain(&nh->head, val, v); 558 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
505 srcu_read_unlock(&nh->srcu, idx); 559 srcu_read_unlock(&nh->srcu, idx);
506 return ret; 560 return ret;
507} 561}
562EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
508 563
564int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
565 unsigned long val, void *v)
566{
567 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
568}
509EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); 569EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
510 570
511/** 571/**
@@ -881,7 +941,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
881#ifdef CONFIG_SOFTWARE_SUSPEND 941#ifdef CONFIG_SOFTWARE_SUSPEND
882 case LINUX_REBOOT_CMD_SW_SUSPEND: 942 case LINUX_REBOOT_CMD_SW_SUSPEND:
883 { 943 {
884 int ret = pm_suspend(PM_SUSPEND_DISK); 944 int ret = hibernate();
885 unlock_kernel(); 945 unlock_kernel();
886 return ret; 946 return ret;
887 } 947 }
@@ -1292,7 +1352,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
1292} 1352}
1293 1353
1294/* 1354/*
1295 * Samma på svenska.. 1355 * Samma pÃ¥ svenska..
1296 */ 1356 */
1297asmlinkage long sys_setfsgid(gid_t gid) 1357asmlinkage long sys_setfsgid(gid_t gid)
1298{ 1358{
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f0664bd5011c..4073353abd4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -77,6 +77,7 @@ extern int sysctl_drop_caches;
77extern int percpu_pagelist_fraction; 77extern int percpu_pagelist_fraction;
78extern int compat_log; 78extern int compat_log;
79extern int maps_protect; 79extern int maps_protect;
80extern int sysctl_stat_interval;
80 81
81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 82/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
82static int maxolduid = 65535; 83static int maxolduid = 65535;
@@ -857,6 +858,17 @@ static ctl_table vm_table[] = {
857 .extra2 = &one_hundred, 858 .extra2 = &one_hundred,
858 }, 859 },
859#endif 860#endif
861#ifdef CONFIG_SMP
862 {
863 .ctl_name = CTL_UNNUMBERED,
864 .procname = "stat_interval",
865 .data = &sysctl_stat_interval,
866 .maxlen = sizeof(sysctl_stat_interval),
867 .mode = 0644,
868 .proc_handler = &proc_dointvec_jiffies,
869 .strategy = &sysctl_jiffies,
870 },
871#endif
860#if defined(CONFIG_X86_32) || \ 872#if defined(CONFIG_X86_32) || \
861 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 873 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
862 { 874 {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fe5c7db24247..3db5c3c460d7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,15 +74,17 @@ static struct clocksource *watchdog;
74static struct timer_list watchdog_timer; 74static struct timer_list watchdog_timer;
75static DEFINE_SPINLOCK(watchdog_lock); 75static DEFINE_SPINLOCK(watchdog_lock);
76static cycle_t watchdog_last; 76static cycle_t watchdog_last;
77static int watchdog_resumed;
78
77/* 79/*
78 * Interval: 0.5sec Treshold: 0.0625s 80 * Interval: 0.5sec Threshold: 0.0625s
79 */ 81 */
80#define WATCHDOG_INTERVAL (HZ >> 1) 82#define WATCHDOG_INTERVAL (HZ >> 1)
81#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) 83#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
82 84
83static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 85static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
84{ 86{
85 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) 87 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
86 return; 88 return;
87 89
88 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 90 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
@@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data)
98 struct clocksource *cs, *tmp; 100 struct clocksource *cs, *tmp;
99 cycle_t csnow, wdnow; 101 cycle_t csnow, wdnow;
100 int64_t wd_nsec, cs_nsec; 102 int64_t wd_nsec, cs_nsec;
103 int resumed;
101 104
102 spin_lock(&watchdog_lock); 105 spin_lock(&watchdog_lock);
103 106
107 resumed = watchdog_resumed;
108 if (unlikely(resumed))
109 watchdog_resumed = 0;
110
104 wdnow = watchdog->read(); 111 wdnow = watchdog->read();
105 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 112 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
106 watchdog_last = wdnow; 113 watchdog_last = wdnow;
107 114
108 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 115 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
109 csnow = cs->read(); 116 csnow = cs->read();
117
118 if (unlikely(resumed)) {
119 cs->wd_last = csnow;
120 continue;
121 }
122
110 /* Initialized ? */ 123 /* Initialized ? */
111 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 124 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
112 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 125 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
@@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data)
136 } 149 }
137 spin_unlock(&watchdog_lock); 150 spin_unlock(&watchdog_lock);
138} 151}
152static void clocksource_resume_watchdog(void)
153{
154 spin_lock(&watchdog_lock);
155 watchdog_resumed = 1;
156 spin_unlock(&watchdog_lock);
157}
158
139static void clocksource_check_watchdog(struct clocksource *cs) 159static void clocksource_check_watchdog(struct clocksource *cs)
140{ 160{
141 struct clocksource *cse; 161 struct clocksource *cse;
@@ -182,9 +202,34 @@ static void clocksource_check_watchdog(struct clocksource *cs)
182 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 202 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
183 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 203 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
184} 204}
205
206static inline void clocksource_resume_watchdog(void) { }
185#endif 207#endif
186 208
187/** 209/**
210 * clocksource_resume - resume the clocksource(s)
211 */
212void clocksource_resume(void)
213{
214 struct list_head *tmp;
215 unsigned long flags;
216
217 spin_lock_irqsave(&clocksource_lock, flags);
218
219 list_for_each(tmp, &clocksource_list) {
220 struct clocksource *cs;
221
222 cs = list_entry(tmp, struct clocksource, list);
223 if (cs->resume)
224 cs->resume();
225 }
226
227 clocksource_resume_watchdog();
228
229 spin_unlock_irqrestore(&clocksource_lock, flags);
230}
231
232/**
188 * clocksource_get_next - Returns the selected clocksource 233 * clocksource_get_next - Returns the selected clocksource
189 * 234 *
190 */ 235 */
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b734ca4bc75e..8bbcfb77f7d2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -65,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); 65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
66#endif 66#endif
67 SEQ_printf(m, "\n"); 67 SEQ_printf(m, "\n");
68 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", 68 SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",
69 (unsigned long long)ktime_to_ns(timer->expires), 69 (unsigned long long)ktime_to_ns(timer->expires),
70 (unsigned long long)(ktime_to_ns(timer->expires) - now)); 70 (unsigned long long)(ktime_to_ns(timer->expires) - now));
71} 71}
@@ -111,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
111{ 111{
112 SEQ_printf(m, " .index: %d\n", 112 SEQ_printf(m, " .index: %d\n",
113 base->index); 113 base->index);
114 SEQ_printf(m, " .resolution: %Ld nsecs\n", 114 SEQ_printf(m, " .resolution: %Lu nsecs\n",
115 (unsigned long long)ktime_to_ns(base->resolution)); 115 (unsigned long long)ktime_to_ns(base->resolution));
116 SEQ_printf(m, " .get_time: "); 116 SEQ_printf(m, " .get_time: ");
117 print_name_offset(m, base->get_time); 117 print_name_offset(m, base->get_time);
118 SEQ_printf(m, "\n"); 118 SEQ_printf(m, "\n");
119#ifdef CONFIG_HIGH_RES_TIMERS 119#ifdef CONFIG_HIGH_RES_TIMERS
120 SEQ_printf(m, " .offset: %Ld nsecs\n", 120 SEQ_printf(m, " .offset: %Lu nsecs\n",
121 ktime_to_ns(base->offset)); 121 (unsigned long long) ktime_to_ns(base->offset));
122#endif 122#endif
123 SEQ_printf(m, "active timers:\n"); 123 SEQ_printf(m, "active timers:\n");
124 print_active_timers(m, base, now); 124 print_active_timers(m, base, now);
@@ -135,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
135 print_base(m, cpu_base->clock_base + i, now); 135 print_base(m, cpu_base->clock_base + i, now);
136 } 136 }
137#define P(x) \ 137#define P(x) \
138 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) 138 SEQ_printf(m, " .%-15s: %Lu\n", #x, \
139 (unsigned long long)(cpu_base->x))
139#define P_ns(x) \ 140#define P_ns(x) \
140 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ 141 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
141 (u64)(ktime_to_ns(cpu_base->x))) 142 (unsigned long long)(ktime_to_ns(cpu_base->x)))
142 143
143#ifdef CONFIG_HIGH_RES_TIMERS 144#ifdef CONFIG_HIGH_RES_TIMERS
144 P_ns(expires_next); 145 P_ns(expires_next);
@@ -150,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 151
151#ifdef CONFIG_TICK_ONESHOT 152#ifdef CONFIG_TICK_ONESHOT
152# define P(x) \ 153# define P(x) \
153 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) 154 SEQ_printf(m, " .%-15s: %Lu\n", #x, \
155 (unsigned long long)(ts->x))
154# define P_ns(x) \ 156# define P_ns(x) \
155 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ 157 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
156 (u64)(ktime_to_ns(ts->x))) 158 (unsigned long long)(ktime_to_ns(ts->x)))
157 { 159 {
158 struct tick_sched *ts = tick_get_tick_sched(cpu); 160 struct tick_sched *ts = tick_get_tick_sched(cpu);
159 P(nohz_mode); 161 P(nohz_mode);
@@ -167,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
167 P(last_jiffies); 169 P(last_jiffies);
168 P(next_jiffies); 170 P(next_jiffies);
169 P_ns(idle_expires); 171 P_ns(idle_expires);
170 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); 172 SEQ_printf(m, "jiffies: %Lu\n",
173 (unsigned long long)jiffies);
171 } 174 }
172#endif 175#endif
173 176
diff --git a/kernel/timer.c b/kernel/timer.c
index 7a6448340f90..59a28b1752f8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
92/* Functions below help us manage 'deferrable' flag */ 92/* Functions below help us manage 'deferrable' flag */
93static inline unsigned int tbase_get_deferrable(tvec_base_t *base) 93static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
94{ 94{
95 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 95 return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG);
96} 96}
97 97
98static inline tvec_base_t *tbase_get_base(tvec_base_t *base) 98static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
99{ 99{
100 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 100 return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);
101} 101}
102 102
103static inline void timer_set_deferrable(struct timer_list *timer) 103static inline void timer_set_deferrable(struct timer_list *timer)
104{ 104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 105 timer->base = (tvec_base_t *)((unsigned long)timer->base |
106 TBASE_DEFERRABLE_FLAG)); 106 TBASE_DEFERRABLE_FLAG);
107} 107}
108 108
109static inline void 109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{ 111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 112 timer->base = (tvec_base_t *)((unsigned long)new_base |
113 tbase_get_deferrable(timer->base)); 113 tbase_get_deferrable(timer->base));
114} 114}
115 115
@@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1293 long cpu = (long)hcpu; 1293 long cpu = (long)hcpu;
1294 switch(action) { 1294 switch(action) {
1295 case CPU_UP_PREPARE: 1295 case CPU_UP_PREPARE:
1296 case CPU_UP_PREPARE_FROZEN:
1296 if (init_timers_cpu(cpu) < 0) 1297 if (init_timers_cpu(cpu) < 0)
1297 return NOTIFY_BAD; 1298 return NOTIFY_BAD;
1298 break; 1299 break;
1299#ifdef CONFIG_HOTPLUG_CPU 1300#ifdef CONFIG_HOTPLUG_CPU
1300 case CPU_DEAD: 1301 case CPU_DEAD:
1302 case CPU_DEAD_FROZEN:
1301 migrate_timers(cpu); 1303 migrate_timers(cpu);
1302 break; 1304 break;
1303#endif 1305#endif
@@ -1497,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti)
1497 prev = &curr->next; 1499 prev = &curr->next;
1498 } 1500 }
1499 1501
1502 clocksource_resume();
1503
1500 write_seqlock_irqsave(&xtime_lock, flags); 1504 write_seqlock_irqsave(&xtime_lock, flags);
1501 if (ti == time_interpolator) { 1505 if (ti == time_interpolator) {
1502 /* we lost the best time-interpolator: */ 1506 /* we lost the best time-interpolator: */
diff --git a/kernel/wait.c b/kernel/wait.c
index 59a82f63275d..444ddbfaefc4 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -61,7 +61,7 @@ EXPORT_SYMBOL(remove_wait_queue);
61 * The spin_unlock() itself is semi-permeable and only protects 61 * The spin_unlock() itself is semi-permeable and only protects
62 * one way (it only protects stuff inside the critical region and 62 * one way (it only protects stuff inside the critical region and
63 * stops them from bleeding out - it would still allow subsequent 63 * stops them from bleeding out - it would still allow subsequent
64 * loads to move into the the critical region). 64 * loads to move into the critical region).
65 */ 65 */
66void fastcall 66void fastcall
67prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) 67prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6fa5e63085d..fb56fedd5c02 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,30 +36,20 @@
36/* 36/*
37 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
38 * possible cpu). 38 * possible cpu).
39 *
40 * The sequence counters are for flush_scheduled_work(). It wants to wait
41 * until all currently-scheduled works are completed, but it doesn't
42 * want to be livelocked by new, incoming ones. So it waits until
43 * remove_sequence is >= the insert_sequence which pertained when
44 * flush_scheduled_work() was called.
45 */ 39 */
46struct cpu_workqueue_struct { 40struct cpu_workqueue_struct {
47 41
48 spinlock_t lock; 42 spinlock_t lock;
49 43
50 long remove_sequence; /* Least-recently added (next to run) */
51 long insert_sequence; /* Next to add */
52
53 struct list_head worklist; 44 struct list_head worklist;
54 wait_queue_head_t more_work; 45 wait_queue_head_t more_work;
55 wait_queue_head_t work_done; 46 struct work_struct *current_work;
56 47
57 struct workqueue_struct *wq; 48 struct workqueue_struct *wq;
58 struct task_struct *thread; 49 struct task_struct *thread;
50 int should_stop;
59 51
60 int run_depth; /* Detect run_workqueue() recursion depth */ 52 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
63} ____cacheline_aligned; 53} ____cacheline_aligned;
64 54
65/* 55/*
@@ -68,8 +58,10 @@ struct cpu_workqueue_struct {
68 */ 58 */
69struct workqueue_struct { 59struct workqueue_struct {
70 struct cpu_workqueue_struct *cpu_wq; 60 struct cpu_workqueue_struct *cpu_wq;
61 struct list_head list;
71 const char *name; 62 const char *name;
72 struct list_head list; /* Empty if single thread */ 63 int singlethread;
64 int freezeable; /* Freeze threads during suspend */
73}; 65};
74 66
75/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 67/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -77,106 +69,68 @@ struct workqueue_struct {
77static DEFINE_MUTEX(workqueue_mutex); 69static DEFINE_MUTEX(workqueue_mutex);
78static LIST_HEAD(workqueues); 70static LIST_HEAD(workqueues);
79 71
80static int singlethread_cpu; 72static int singlethread_cpu __read_mostly;
73static cpumask_t cpu_singlethread_map __read_mostly;
74/* optimization, we could use cpu_possible_map */
75static cpumask_t cpu_populated_map __read_mostly;
81 76
82/* If it's single threaded, it isn't in the list of workqueues. */ 77/* If it's single threaded, it isn't in the list of workqueues. */
83static inline int is_single_threaded(struct workqueue_struct *wq) 78static inline int is_single_threaded(struct workqueue_struct *wq)
84{ 79{
85 return list_empty(&wq->list); 80 return wq->singlethread;
81}
82
83static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
84{
85 return is_single_threaded(wq)
86 ? &cpu_singlethread_map : &cpu_populated_map;
87}
88
89static
90struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
91{
92 if (unlikely(is_single_threaded(wq)))
93 cpu = singlethread_cpu;
94 return per_cpu_ptr(wq->cpu_wq, cpu);
86} 95}
87 96
88/* 97/*
89 * Set the workqueue on which a work item is to be run 98 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set 99 * - Must *only* be called if the pending flag is set
91 */ 100 */
92static inline void set_wq_data(struct work_struct *work, void *wq) 101static inline void set_wq_data(struct work_struct *work,
102 struct cpu_workqueue_struct *cwq)
93{ 103{
94 unsigned long new; 104 unsigned long new;
95 105
96 BUG_ON(!work_pending(work)); 106 BUG_ON(!work_pending(work));
97 107
98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); 108 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
99 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 109 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
100 atomic_long_set(&work->data, new); 110 atomic_long_set(&work->data, new);
101} 111}
102 112
103static inline void *get_wq_data(struct work_struct *work) 113static inline
114struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
104{ 115{
105 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 116 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
106} 117}
107 118
108static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) 119static void insert_work(struct cpu_workqueue_struct *cwq,
120 struct work_struct *work, int tail)
109{ 121{
110 int ret = 0; 122 set_wq_data(work, cwq);
111 unsigned long flags;
112
113 spin_lock_irqsave(&cwq->lock, flags);
114 /* 123 /*
115 * We need to re-validate the work info after we've gotten 124 * Ensure that we get the right work->data if we see the
116 * the cpu_workqueue lock. We can run the work now iff: 125 * result of list_add() below, see try_to_grab_pending().
117 *
118 * - the wq_data still matches the cpu_workqueue_struct
119 * - AND the work is still marked pending
120 * - AND the work is still on a list (which will be this
121 * workqueue_struct list)
122 *
123 * All these conditions are important, because we
124 * need to protect against the work being run right
125 * now on another CPU (all but the last one might be
126 * true if it's currently running and has not been
127 * released yet, for example).
128 */ 126 */
129 if (get_wq_data(work) == cwq 127 smp_wmb();
130 && work_pending(work) 128 if (tail)
131 && !list_empty(&work->entry)) { 129 list_add_tail(&work->entry, &cwq->worklist);
132 work_func_t f = work->func; 130 else
133 list_del_init(&work->entry); 131 list_add(&work->entry, &cwq->worklist);
134 spin_unlock_irqrestore(&cwq->lock, flags); 132 wake_up(&cwq->more_work);
135
136 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
137 work_release(work);
138 f(work);
139
140 spin_lock_irqsave(&cwq->lock, flags);
141 cwq->remove_sequence++;
142 wake_up(&cwq->work_done);
143 ret = 1;
144 }
145 spin_unlock_irqrestore(&cwq->lock, flags);
146 return ret;
147}
148
149/**
150 * run_scheduled_work - run scheduled work synchronously
151 * @work: work to run
152 *
153 * This checks if the work was pending, and runs it
154 * synchronously if so. It returns a boolean to indicate
155 * whether it had any scheduled work to run or not.
156 *
157 * NOTE! This _only_ works for normal work_structs. You
158 * CANNOT use this for delayed work, because the wq data
159 * for delayed work will not point properly to the per-
160 * CPU workqueue struct, but will change!
161 */
162int fastcall run_scheduled_work(struct work_struct *work)
163{
164 for (;;) {
165 struct cpu_workqueue_struct *cwq;
166
167 if (!work_pending(work))
168 return 0;
169 if (list_empty(&work->entry))
170 return 0;
171 /* NOTE! This depends intimately on __queue_work! */
172 cwq = get_wq_data(work);
173 if (!cwq)
174 return 0;
175 if (__run_work(cwq, work))
176 return 1;
177 }
178} 133}
179EXPORT_SYMBOL(run_scheduled_work);
180 134
181/* Preempt must be disabled. */ 135/* Preempt must be disabled. */
182static void __queue_work(struct cpu_workqueue_struct *cwq, 136static void __queue_work(struct cpu_workqueue_struct *cwq,
@@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
185 unsigned long flags; 139 unsigned long flags;
186 140
187 spin_lock_irqsave(&cwq->lock, flags); 141 spin_lock_irqsave(&cwq->lock, flags);
188 set_wq_data(work, cwq); 142 insert_work(cwq, work, 1);
189 list_add_tail(&work->entry, &cwq->worklist);
190 cwq->insert_sequence++;
191 wake_up(&cwq->more_work);
192 spin_unlock_irqrestore(&cwq->lock, flags); 143 spin_unlock_irqrestore(&cwq->lock, flags);
193} 144}
194 145
@@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
204 */ 155 */
205int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) 156int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
206{ 157{
207 int ret = 0, cpu = get_cpu(); 158 int ret = 0;
208 159
209 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 160 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
210 if (unlikely(is_single_threaded(wq)))
211 cpu = singlethread_cpu;
212 BUG_ON(!list_empty(&work->entry)); 161 BUG_ON(!list_empty(&work->entry));
213 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 162 __queue_work(wq_per_cpu(wq, get_cpu()), work);
163 put_cpu();
214 ret = 1; 164 ret = 1;
215 } 165 }
216 put_cpu();
217 return ret; 166 return ret;
218} 167}
219EXPORT_SYMBOL_GPL(queue_work); 168EXPORT_SYMBOL_GPL(queue_work);
@@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work);
221void delayed_work_timer_fn(unsigned long __data) 170void delayed_work_timer_fn(unsigned long __data)
222{ 171{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 172 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 173 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
225 int cpu = smp_processor_id(); 174 struct workqueue_struct *wq = cwq->wq;
226 175
227 if (unlikely(is_single_threaded(wq))) 176 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
228 cpu = singlethread_cpu;
229
230 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
231} 177}
232 178
233/** 179/**
@@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data)
241int fastcall queue_delayed_work(struct workqueue_struct *wq, 187int fastcall queue_delayed_work(struct workqueue_struct *wq,
242 struct delayed_work *dwork, unsigned long delay) 188 struct delayed_work *dwork, unsigned long delay)
243{ 189{
244 int ret = 0; 190 timer_stats_timer_set_start_info(&dwork->timer);
245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work;
247
248 timer_stats_timer_set_start_info(timer);
249 if (delay == 0) 191 if (delay == 0)
250 return queue_work(wq, work); 192 return queue_work(wq, &dwork->work);
251
252 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
253 BUG_ON(timer_pending(timer));
254 BUG_ON(!list_empty(&work->entry));
255 193
256 /* This stores wq for the moment, for the timer_fn */ 194 return queue_delayed_work_on(-1, wq, dwork, delay);
257 set_wq_data(work, wq);
258 timer->expires = jiffies + delay;
259 timer->data = (unsigned long)dwork;
260 timer->function = delayed_work_timer_fn;
261 add_timer(timer);
262 ret = 1;
263 }
264 return ret;
265} 195}
266EXPORT_SYMBOL_GPL(queue_delayed_work); 196EXPORT_SYMBOL_GPL(queue_delayed_work);
267 197
@@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
285 BUG_ON(timer_pending(timer)); 215 BUG_ON(timer_pending(timer));
286 BUG_ON(!list_empty(&work->entry)); 216 BUG_ON(!list_empty(&work->entry));
287 217
288 /* This stores wq for the moment, for the timer_fn */ 218 /* This stores cwq for the moment, for the timer_fn */
289 set_wq_data(work, wq); 219 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
290 timer->expires = jiffies + delay; 220 timer->expires = jiffies + delay;
291 timer->data = (unsigned long)dwork; 221 timer->data = (unsigned long)dwork;
292 timer->function = delayed_work_timer_fn; 222 timer->function = delayed_work_timer_fn;
293 add_timer_on(timer, cpu); 223
224 if (unlikely(cpu >= 0))
225 add_timer_on(timer, cpu);
226 else
227 add_timer(timer);
294 ret = 1; 228 ret = 1;
295 } 229 }
296 return ret; 230 return ret;
@@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
299 233
300static void run_workqueue(struct cpu_workqueue_struct *cwq) 234static void run_workqueue(struct cpu_workqueue_struct *cwq)
301{ 235{
302 unsigned long flags; 236 spin_lock_irq(&cwq->lock);
303
304 /*
305 * Keep taking off work from the queue until
306 * done.
307 */
308 spin_lock_irqsave(&cwq->lock, flags);
309 cwq->run_depth++; 237 cwq->run_depth++;
310 if (cwq->run_depth > 3) { 238 if (cwq->run_depth > 3) {
311 /* morton gets to eat his hat */ 239 /* morton gets to eat his hat */
@@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
318 struct work_struct, entry); 246 struct work_struct, entry);
319 work_func_t f = work->func; 247 work_func_t f = work->func;
320 248
249 cwq->current_work = work;
321 list_del_init(cwq->worklist.next); 250 list_del_init(cwq->worklist.next);
322 spin_unlock_irqrestore(&cwq->lock, flags); 251 spin_unlock_irq(&cwq->lock);
323 252
324 BUG_ON(get_wq_data(work) != cwq); 253 BUG_ON(get_wq_data(work) != cwq);
325 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) 254 work_clear_pending(work);
326 work_release(work);
327 f(work); 255 f(work);
328 256
329 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 257 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
@@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
337 dump_stack(); 265 dump_stack();
338 } 266 }
339 267
340 spin_lock_irqsave(&cwq->lock, flags); 268 spin_lock_irq(&cwq->lock);
341 cwq->remove_sequence++; 269 cwq->current_work = NULL;
342 wake_up(&cwq->work_done);
343 } 270 }
344 cwq->run_depth--; 271 cwq->run_depth--;
345 spin_unlock_irqrestore(&cwq->lock, flags); 272 spin_unlock_irq(&cwq->lock);
273}
274
275/*
276 * NOTE: the caller must not touch *cwq if this func returns true
277 */
278static int cwq_should_stop(struct cpu_workqueue_struct *cwq)
279{
280 int should_stop = cwq->should_stop;
281
282 if (unlikely(should_stop)) {
283 spin_lock_irq(&cwq->lock);
284 should_stop = cwq->should_stop && list_empty(&cwq->worklist);
285 if (should_stop)
286 cwq->thread = NULL;
287 spin_unlock_irq(&cwq->lock);
288 }
289
290 return should_stop;
346} 291}
347 292
348static int worker_thread(void *__cwq) 293static int worker_thread(void *__cwq)
349{ 294{
350 struct cpu_workqueue_struct *cwq = __cwq; 295 struct cpu_workqueue_struct *cwq = __cwq;
351 DECLARE_WAITQUEUE(wait, current); 296 DEFINE_WAIT(wait);
352 struct k_sigaction sa;
353 sigset_t blocked;
354 297
355 if (!cwq->freezeable) 298 if (!cwq->wq->freezeable)
356 current->flags |= PF_NOFREEZE; 299 current->flags |= PF_NOFREEZE;
357 300
358 set_user_nice(current, -5); 301 set_user_nice(current, -5);
359 302
360 /* Block and flush all signals */ 303 for (;;) {
361 sigfillset(&blocked); 304 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
362 sigprocmask(SIG_BLOCK, &blocked, NULL); 305 if (!freezing(current) && !cwq->should_stop
363 flush_signals(current); 306 && list_empty(&cwq->worklist))
364 307 schedule();
365 /* 308 finish_wait(&cwq->more_work, &wait);
366 * We inherited MPOL_INTERLEAVE from the booting kernel.
367 * Set MPOL_DEFAULT to insure node local allocations.
368 */
369 numa_default_policy();
370
371 /* SIG_IGN makes children autoreap: see do_notify_parent(). */
372 sa.sa.sa_handler = SIG_IGN;
373 sa.sa.sa_flags = 0;
374 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
375 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
376 309
377 set_current_state(TASK_INTERRUPTIBLE); 310 try_to_freeze();
378 while (!kthread_should_stop()) {
379 if (cwq->freezeable)
380 try_to_freeze();
381 311
382 add_wait_queue(&cwq->more_work, &wait); 312 if (cwq_should_stop(cwq))
383 if (list_empty(&cwq->worklist)) 313 break;
384 schedule();
385 else
386 __set_current_state(TASK_RUNNING);
387 remove_wait_queue(&cwq->more_work, &wait);
388 314
389 if (!list_empty(&cwq->worklist)) 315 run_workqueue(cwq);
390 run_workqueue(cwq);
391 set_current_state(TASK_INTERRUPTIBLE);
392 } 316 }
393 __set_current_state(TASK_RUNNING); 317
394 return 0; 318 return 0;
395} 319}
396 320
321struct wq_barrier {
322 struct work_struct work;
323 struct completion done;
324};
325
326static void wq_barrier_func(struct work_struct *work)
327{
328 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
329 complete(&barr->done);
330}
331
332static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
333 struct wq_barrier *barr, int tail)
334{
335 INIT_WORK(&barr->work, wq_barrier_func);
336 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
337
338 init_completion(&barr->done);
339
340 insert_work(cwq, &barr->work, tail);
341}
342
397static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 343static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
398{ 344{
399 if (cwq->thread == current) { 345 if (cwq->thread == current) {
@@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
403 */ 349 */
404 run_workqueue(cwq); 350 run_workqueue(cwq);
405 } else { 351 } else {
406 DEFINE_WAIT(wait); 352 struct wq_barrier barr;
407 long sequence_needed; 353 int active = 0;
408 354
409 spin_lock_irq(&cwq->lock); 355 spin_lock_irq(&cwq->lock);
410 sequence_needed = cwq->insert_sequence; 356 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
411 357 insert_wq_barrier(cwq, &barr, 1);
412 while (sequence_needed - cwq->remove_sequence > 0) { 358 active = 1;
413 prepare_to_wait(&cwq->work_done, &wait,
414 TASK_UNINTERRUPTIBLE);
415 spin_unlock_irq(&cwq->lock);
416 schedule();
417 spin_lock_irq(&cwq->lock);
418 } 359 }
419 finish_wait(&cwq->work_done, &wait);
420 spin_unlock_irq(&cwq->lock); 360 spin_unlock_irq(&cwq->lock);
361
362 if (active)
363 wait_for_completion(&barr.done);
421 } 364 }
422} 365}
423 366
@@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
428 * Forces execution of the workqueue and blocks until its completion. 371 * Forces execution of the workqueue and blocks until its completion.
429 * This is typically used in driver shutdown handlers. 372 * This is typically used in driver shutdown handlers.
430 * 373 *
431 * This function will sample each workqueue's current insert_sequence number and 374 * We sleep until all works which were queued on entry have been handled,
432 * will sleep until the head sequence is greater than or equal to that. This 375 * but we are not livelocked by new incoming ones.
433 * means that we sleep until all works which were queued on entry have been
434 * handled, but we are not livelocked by new incoming ones.
435 * 376 *
436 * This function used to run the workqueues itself. Now we just wait for the 377 * This function used to run the workqueues itself. Now we just wait for the
437 * helper threads to do it. 378 * helper threads to do it.
438 */ 379 */
439void fastcall flush_workqueue(struct workqueue_struct *wq) 380void fastcall flush_workqueue(struct workqueue_struct *wq)
440{ 381{
382 const cpumask_t *cpu_map = wq_cpu_map(wq);
383 int cpu;
384
441 might_sleep(); 385 might_sleep();
386 for_each_cpu_mask(cpu, *cpu_map)
387 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
388}
389EXPORT_SYMBOL_GPL(flush_workqueue);
442 390
443 if (is_single_threaded(wq)) { 391/*
444 /* Always use first cpu's area. */ 392 * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
445 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); 393 * so this work can't be re-armed in any way.
446 } else { 394 */
447 int cpu; 395static int try_to_grab_pending(struct work_struct *work)
396{
397 struct cpu_workqueue_struct *cwq;
398 int ret = 0;
448 399
449 mutex_lock(&workqueue_mutex); 400 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
450 for_each_online_cpu(cpu) 401 return 1;
451 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 402
452 mutex_unlock(&workqueue_mutex); 403 /*
404 * The queueing is in progress, or it is already queued. Try to
405 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
406 */
407
408 cwq = get_wq_data(work);
409 if (!cwq)
410 return ret;
411
412 spin_lock_irq(&cwq->lock);
413 if (!list_empty(&work->entry)) {
414 /*
415 * This work is queued, but perhaps we locked the wrong cwq.
416 * In that case we must see the new value after rmb(), see
417 * insert_work()->wmb().
418 */
419 smp_rmb();
420 if (cwq == get_wq_data(work)) {
421 list_del_init(&work->entry);
422 ret = 1;
423 }
453 } 424 }
425 spin_unlock_irq(&cwq->lock);
426
427 return ret;
454} 428}
455EXPORT_SYMBOL_GPL(flush_workqueue);
456 429
457static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 430static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
458 int cpu, int freezeable) 431 struct work_struct *work)
459{ 432{
460 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 433 struct wq_barrier barr;
461 struct task_struct *p; 434 int running = 0;
462 435
463 spin_lock_init(&cwq->lock); 436 spin_lock_irq(&cwq->lock);
464 cwq->wq = wq; 437 if (unlikely(cwq->current_work == work)) {
465 cwq->thread = NULL; 438 insert_wq_barrier(cwq, &barr, 0);
466 cwq->insert_sequence = 0; 439 running = 1;
467 cwq->remove_sequence = 0; 440 }
468 cwq->freezeable = freezeable; 441 spin_unlock_irq(&cwq->lock);
469 INIT_LIST_HEAD(&cwq->worklist);
470 init_waitqueue_head(&cwq->more_work);
471 init_waitqueue_head(&cwq->work_done);
472 442
473 if (is_single_threaded(wq)) 443 if (unlikely(running))
474 p = kthread_create(worker_thread, cwq, "%s", wq->name); 444 wait_for_completion(&barr.done);
475 else
476 p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
477 if (IS_ERR(p))
478 return NULL;
479 cwq->thread = p;
480 return p;
481} 445}
482 446
483struct workqueue_struct *__create_workqueue(const char *name, 447static void wait_on_work(struct work_struct *work)
484 int singlethread, int freezeable)
485{ 448{
486 int cpu, destroy = 0; 449 struct cpu_workqueue_struct *cwq;
487 struct workqueue_struct *wq; 450 struct workqueue_struct *wq;
488 struct task_struct *p; 451 const cpumask_t *cpu_map;
452 int cpu;
489 453
490 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 454 might_sleep();
491 if (!wq)
492 return NULL;
493 455
494 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 456 cwq = get_wq_data(work);
495 if (!wq->cpu_wq) { 457 if (!cwq)
496 kfree(wq); 458 return;
497 return NULL;
498 }
499 459
500 wq->name = name; 460 wq = cwq->wq;
501 mutex_lock(&workqueue_mutex); 461 cpu_map = wq_cpu_map(wq);
502 if (singlethread) {
503 INIT_LIST_HEAD(&wq->list);
504 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
505 if (!p)
506 destroy = 1;
507 else
508 wake_up_process(p);
509 } else {
510 list_add(&wq->list, &workqueues);
511 for_each_online_cpu(cpu) {
512 p = create_workqueue_thread(wq, cpu, freezeable);
513 if (p) {
514 kthread_bind(p, cpu);
515 wake_up_process(p);
516 } else
517 destroy = 1;
518 }
519 }
520 mutex_unlock(&workqueue_mutex);
521 462
522 /* 463 for_each_cpu_mask(cpu, *cpu_map)
523 * Was there any error during startup? If yes then clean up: 464 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
524 */
525 if (destroy) {
526 destroy_workqueue(wq);
527 wq = NULL;
528 }
529 return wq;
530} 465}
531EXPORT_SYMBOL_GPL(__create_workqueue);
532 466
533static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) 467/**
468 * cancel_work_sync - block until a work_struct's callback has terminated
469 * @work: the work which is to be flushed
470 *
471 * cancel_work_sync() will cancel the work if it is queued. If the work's
472 * callback appears to be running, cancel_work_sync() will block until it
473 * has completed.
474 *
475 * It is possible to use this function if the work re-queues itself. It can
476 * cancel the work even if it migrates to another workqueue, however in that
477 * case it only guarantees that work->func() has completed on the last queued
478 * workqueue.
479 *
480 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
481 * pending, otherwise it goes into a busy-wait loop until the timer expires.
482 *
483 * The caller must ensure that workqueue_struct on which this work was last
484 * queued can't be destroyed before this function returns.
485 */
486void cancel_work_sync(struct work_struct *work)
534{ 487{
535 struct cpu_workqueue_struct *cwq; 488 while (!try_to_grab_pending(work))
536 unsigned long flags; 489 cpu_relax();
537 struct task_struct *p; 490 wait_on_work(work);
538 491 work_clear_pending(work);
539 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
540 spin_lock_irqsave(&cwq->lock, flags);
541 p = cwq->thread;
542 cwq->thread = NULL;
543 spin_unlock_irqrestore(&cwq->lock, flags);
544 if (p)
545 kthread_stop(p);
546} 492}
493EXPORT_SYMBOL_GPL(cancel_work_sync);
547 494
548/** 495/**
549 * destroy_workqueue - safely terminate a workqueue 496 * cancel_rearming_delayed_work - reliably kill off a delayed work.
550 * @wq: target workqueue 497 * @dwork: the delayed work struct
551 * 498 *
552 * Safely destroy a workqueue. All work currently pending will be done first. 499 * It is possible to use this function if @dwork rearms itself via queue_work()
500 * or queue_delayed_work(). See also the comment for cancel_work_sync().
553 */ 501 */
554void destroy_workqueue(struct workqueue_struct *wq) 502void cancel_rearming_delayed_work(struct delayed_work *dwork)
555{ 503{
556 int cpu; 504 while (!del_timer(&dwork->timer) &&
557 505 !try_to_grab_pending(&dwork->work))
558 flush_workqueue(wq); 506 cpu_relax();
559 507 wait_on_work(&dwork->work);
560 /* We don't need the distraction of CPUs appearing and vanishing. */ 508 work_clear_pending(&dwork->work);
561 mutex_lock(&workqueue_mutex);
562 if (is_single_threaded(wq))
563 cleanup_workqueue_thread(wq, singlethread_cpu);
564 else {
565 for_each_online_cpu(cpu)
566 cleanup_workqueue_thread(wq, cpu);
567 list_del(&wq->list);
568 }
569 mutex_unlock(&workqueue_mutex);
570 free_percpu(wq->cpu_wq);
571 kfree(wq);
572} 509}
573EXPORT_SYMBOL_GPL(destroy_workqueue); 510EXPORT_SYMBOL(cancel_rearming_delayed_work);
574 511
575static struct workqueue_struct *keventd_wq; 512static struct workqueue_struct *keventd_wq __read_mostly;
576 513
577/** 514/**
578 * schedule_work - put work task in global workqueue 515 * schedule_work - put work task in global workqueue
@@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func)
638 if (!works) 575 if (!works)
639 return -ENOMEM; 576 return -ENOMEM;
640 577
641 mutex_lock(&workqueue_mutex); 578 preempt_disable(); /* CPU hotplug */
642 for_each_online_cpu(cpu) { 579 for_each_online_cpu(cpu) {
643 struct work_struct *work = per_cpu_ptr(works, cpu); 580 struct work_struct *work = per_cpu_ptr(works, cpu);
644 581
@@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func)
646 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 583 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
647 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); 584 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
648 } 585 }
649 mutex_unlock(&workqueue_mutex); 586 preempt_enable();
650 flush_workqueue(keventd_wq); 587 flush_workqueue(keventd_wq);
651 free_percpu(works); 588 free_percpu(works);
652 return 0; 589 return 0;
@@ -659,29 +596,6 @@ void flush_scheduled_work(void)
659EXPORT_SYMBOL(flush_scheduled_work); 596EXPORT_SYMBOL(flush_scheduled_work);
660 597
661/** 598/**
662 * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
663 * @wq: the controlling workqueue structure
664 * @dwork: the delayed work struct
665 */
666void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
667 struct delayed_work *dwork)
668{
669 while (!cancel_delayed_work(dwork))
670 flush_workqueue(wq);
671}
672EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
673
674/**
675 * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work.
676 * @dwork: the delayed work struct
677 */
678void cancel_rearming_delayed_work(struct delayed_work *dwork)
679{
680 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
681}
682EXPORT_SYMBOL(cancel_rearming_delayed_work);
683
684/**
685 * execute_in_process_context - reliably execute the routine with user context 599 * execute_in_process_context - reliably execute the routine with user context
686 * @fn: the function to execute 600 * @fn: the function to execute
687 * @ew: guaranteed storage for the execute work structure (must 601 * @ew: guaranteed storage for the execute work structure (must
@@ -728,94 +642,209 @@ int current_is_keventd(void)
728 642
729} 643}
730 644
731/* Take the work from this (downed) CPU. */ 645static struct cpu_workqueue_struct *
732static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 646init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
733{ 647{
734 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 648 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
735 struct list_head list;
736 struct work_struct *work;
737 649
738 spin_lock_irq(&cwq->lock); 650 cwq->wq = wq;
739 list_replace_init(&cwq->worklist, &list); 651 spin_lock_init(&cwq->lock);
652 INIT_LIST_HEAD(&cwq->worklist);
653 init_waitqueue_head(&cwq->more_work);
654
655 return cwq;
656}
657
658static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
659{
660 struct workqueue_struct *wq = cwq->wq;
661 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
662 struct task_struct *p;
663
664 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
665 /*
666 * Nobody can add the work_struct to this cwq,
667 * if (caller is __create_workqueue)
668 * nobody should see this wq
669 * else // caller is CPU_UP_PREPARE
670 * cpu is not on cpu_online_map
671 * so we can abort safely.
672 */
673 if (IS_ERR(p))
674 return PTR_ERR(p);
675
676 cwq->thread = p;
677 cwq->should_stop = 0;
678
679 return 0;
680}
681
682static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
683{
684 struct task_struct *p = cwq->thread;
740 685
741 while (!list_empty(&list)) { 686 if (p != NULL) {
742 printk("Taking work for %s\n", wq->name); 687 if (cpu >= 0)
743 work = list_entry(list.next,struct work_struct,entry); 688 kthread_bind(p, cpu);
744 list_del(&work->entry); 689 wake_up_process(p);
745 __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
746 } 690 }
747 spin_unlock_irq(&cwq->lock);
748} 691}
749 692
750/* We're holding the cpucontrol mutex here */ 693struct workqueue_struct *__create_workqueue(const char *name,
751static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 694 int singlethread, int freezeable)
752 unsigned long action,
753 void *hcpu)
754{ 695{
755 unsigned int hotcpu = (unsigned long)hcpu;
756 struct workqueue_struct *wq; 696 struct workqueue_struct *wq;
697 struct cpu_workqueue_struct *cwq;
698 int err = 0, cpu;
757 699
758 switch (action) { 700 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
759 case CPU_UP_PREPARE: 701 if (!wq)
760 mutex_lock(&workqueue_mutex); 702 return NULL;
761 /* Create a new workqueue thread for it. */
762 list_for_each_entry(wq, &workqueues, list) {
763 if (!create_workqueue_thread(wq, hotcpu, 0)) {
764 printk("workqueue for %i failed\n", hotcpu);
765 return NOTIFY_BAD;
766 }
767 }
768 break;
769 703
770 case CPU_ONLINE: 704 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
771 /* Kick off worker threads. */ 705 if (!wq->cpu_wq) {
772 list_for_each_entry(wq, &workqueues, list) { 706 kfree(wq);
773 struct cpu_workqueue_struct *cwq; 707 return NULL;
708 }
774 709
775 cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); 710 wq->name = name;
776 kthread_bind(cwq->thread, hotcpu); 711 wq->singlethread = singlethread;
777 wake_up_process(cwq->thread); 712 wq->freezeable = freezeable;
778 } 713 INIT_LIST_HEAD(&wq->list);
779 mutex_unlock(&workqueue_mutex);
780 break;
781 714
782 case CPU_UP_CANCELED: 715 if (singlethread) {
783 list_for_each_entry(wq, &workqueues, list) { 716 cwq = init_cpu_workqueue(wq, singlethread_cpu);
784 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) 717 err = create_workqueue_thread(cwq, singlethread_cpu);
718 start_workqueue_thread(cwq, -1);
719 } else {
720 mutex_lock(&workqueue_mutex);
721 list_add(&wq->list, &workqueues);
722
723 for_each_possible_cpu(cpu) {
724 cwq = init_cpu_workqueue(wq, cpu);
725 if (err || !cpu_online(cpu))
785 continue; 726 continue;
786 /* Unbind so it can run. */ 727 err = create_workqueue_thread(cwq, cpu);
787 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 728 start_workqueue_thread(cwq, cpu);
788 any_online_cpu(cpu_online_map));
789 cleanup_workqueue_thread(wq, hotcpu);
790 } 729 }
791 mutex_unlock(&workqueue_mutex); 730 mutex_unlock(&workqueue_mutex);
792 break; 731 }
732
733 if (err) {
734 destroy_workqueue(wq);
735 wq = NULL;
736 }
737 return wq;
738}
739EXPORT_SYMBOL_GPL(__create_workqueue);
740
741static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
742{
743 struct wq_barrier barr;
744 int alive = 0;
745
746 spin_lock_irq(&cwq->lock);
747 if (cwq->thread != NULL) {
748 insert_wq_barrier(cwq, &barr, 1);
749 cwq->should_stop = 1;
750 alive = 1;
751 }
752 spin_unlock_irq(&cwq->lock);
753
754 if (alive) {
755 wait_for_completion(&barr.done);
793 756
794 case CPU_DOWN_PREPARE: 757 while (unlikely(cwq->thread != NULL))
758 cpu_relax();
759 /*
760 * Wait until cwq->thread unlocks cwq->lock,
761 * it won't touch *cwq after that.
762 */
763 smp_rmb();
764 spin_unlock_wait(&cwq->lock);
765 }
766}
767
768/**
769 * destroy_workqueue - safely terminate a workqueue
770 * @wq: target workqueue
771 *
772 * Safely destroy a workqueue. All work currently pending will be done first.
773 */
774void destroy_workqueue(struct workqueue_struct *wq)
775{
776 const cpumask_t *cpu_map = wq_cpu_map(wq);
777 struct cpu_workqueue_struct *cwq;
778 int cpu;
779
780 mutex_lock(&workqueue_mutex);
781 list_del(&wq->list);
782 mutex_unlock(&workqueue_mutex);
783
784 for_each_cpu_mask(cpu, *cpu_map) {
785 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
786 cleanup_workqueue_thread(cwq, cpu);
787 }
788
789 free_percpu(wq->cpu_wq);
790 kfree(wq);
791}
792EXPORT_SYMBOL_GPL(destroy_workqueue);
793
794static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
795 unsigned long action,
796 void *hcpu)
797{
798 unsigned int cpu = (unsigned long)hcpu;
799 struct cpu_workqueue_struct *cwq;
800 struct workqueue_struct *wq;
801
802 action &= ~CPU_TASKS_FROZEN;
803
804 switch (action) {
805 case CPU_LOCK_ACQUIRE:
795 mutex_lock(&workqueue_mutex); 806 mutex_lock(&workqueue_mutex);
796 break; 807 return NOTIFY_OK;
797 808
798 case CPU_DOWN_FAILED: 809 case CPU_LOCK_RELEASE:
799 mutex_unlock(&workqueue_mutex); 810 mutex_unlock(&workqueue_mutex);
800 break; 811 return NOTIFY_OK;
801 812
802 case CPU_DEAD: 813 case CPU_UP_PREPARE:
803 list_for_each_entry(wq, &workqueues, list) 814 cpu_set(cpu, cpu_populated_map);
804 cleanup_workqueue_thread(wq, hotcpu); 815 }
805 list_for_each_entry(wq, &workqueues, list) 816
806 take_over_work(wq, hotcpu); 817 list_for_each_entry(wq, &workqueues, list) {
807 mutex_unlock(&workqueue_mutex); 818 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
808 break; 819
820 switch (action) {
821 case CPU_UP_PREPARE:
822 if (!create_workqueue_thread(cwq, cpu))
823 break;
824 printk(KERN_ERR "workqueue for %i failed\n", cpu);
825 return NOTIFY_BAD;
826
827 case CPU_ONLINE:
828 start_workqueue_thread(cwq, cpu);
829 break;
830
831 case CPU_UP_CANCELED:
832 start_workqueue_thread(cwq, -1);
833 case CPU_DEAD:
834 cleanup_workqueue_thread(cwq, cpu);
835 break;
836 }
809 } 837 }
810 838
811 return NOTIFY_OK; 839 return NOTIFY_OK;
812} 840}
813 841
814void init_workqueues(void) 842void __init init_workqueues(void)
815{ 843{
844 cpu_populated_map = cpu_online_map;
816 singlethread_cpu = first_cpu(cpu_possible_map); 845 singlethread_cpu = first_cpu(cpu_possible_map);
846 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
817 hotcpu_notifier(workqueue_cpu_callback, 0); 847 hotcpu_notifier(workqueue_cpu_callback, 0);
818 keventd_wq = create_workqueue("events"); 848 keventd_wq = create_workqueue("events");
819 BUG_ON(!keventd_wq); 849 BUG_ON(!keventd_wq);
820} 850}
821