Diffstat (limited to 'kernel')
-rw-r--r--  kernel/configs.c             15
-rw-r--r--  kernel/cpu.c                 66
-rw-r--r--  kernel/cpuset.c               7
-rw-r--r--  kernel/exit.c                18
-rw-r--r--  kernel/fork.c                 4
-rw-r--r--  kernel/futex.c              988
-rw-r--r--  kernel/futex_compat.c        22
-rw-r--r--  kernel/hrtimer.c              2
-rw-r--r--  kernel/irq/handle.c           1
-rw-r--r--  kernel/kmod.c                 6
-rw-r--r--  kernel/kthread.c            113
-rw-r--r--  kernel/mutex.c                8
-rw-r--r--  kernel/power/disk.c         195
-rw-r--r--  kernel/power/main.c          42
-rw-r--r--  kernel/power/power.h          7
-rw-r--r--  kernel/power/snapshot.c       2
-rw-r--r--  kernel/power/user.c          13
-rw-r--r--  kernel/profile.c              4
-rw-r--r--  kernel/rcupdate.c             2
-rw-r--r--  kernel/relay.c               37
-rw-r--r--  kernel/rtmutex.c             41
-rw-r--r--  kernel/rtmutex_common.h      34
-rw-r--r--  kernel/sched.c               38
-rw-r--r--  kernel/signal.c             140
-rw-r--r--  kernel/softirq.c              4
-rw-r--r--  kernel/softlockup.c           4
-rw-r--r--  kernel/sys.c                 96
-rw-r--r--  kernel/sysctl.c              12
-rw-r--r--  kernel/time/clocksource.c    51
-rw-r--r--  kernel/time/timer_list.c     25
-rw-r--r--  kernel/timer.c               14
-rw-r--r--  kernel/workqueue.c          783
32 files changed, 1702 insertions, 1092 deletions
diff --git a/kernel/configs.c b/kernel/configs.c
index 8fa1fb28f8a7..e84d3f9c6c7b 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -61,18 +61,9 @@ static ssize_t
61ikconfig_read_current(struct file *file, char __user *buf, 61ikconfig_read_current(struct file *file, char __user *buf,
62 size_t len, loff_t * offset) 62 size_t len, loff_t * offset)
63{ 63{
64 loff_t pos = *offset; 64 return simple_read_from_buffer(buf, len, offset,
65 ssize_t count; 65 kernel_config_data + MAGIC_SIZE,
66 66 kernel_config_data_size);
67 if (pos >= kernel_config_data_size)
68 return 0;
69
70 count = min(len, (size_t)(kernel_config_data_size - pos));
71 if (copy_to_user(buf, kernel_config_data + MAGIC_SIZE + pos, count))
72 return -EFAULT;
73
74 *offset += count;
75 return count;
76} 67}
77 68
78static const struct file_operations ikconfig_file_ops = { 69static const struct file_operations ikconfig_file_ops = {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 36e70845cfc3..208cf3497c10 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -97,7 +97,7 @@ static inline void check_for_tasks(int cpu)
97 (!cputime_eq(p->utime, cputime_zero) || 97 (!cputime_eq(p->utime, cputime_zero) ||
98 !cputime_eq(p->stime, cputime_zero))) 98 !cputime_eq(p->stime, cputime_zero)))
99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 99 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
100 (state = %ld, flags = %lx) \n", 100 (state = %ld, flags = %x) \n",
101 p->comm, p->pid, cpu, p->state, p->flags); 101 p->comm, p->pid, cpu, p->state, p->flags);
102 } 102 }
103 write_unlock_irq(&tasklist_lock); 103 write_unlock_irq(&tasklist_lock);
@@ -120,11 +120,13 @@ static int take_cpu_down(void *unused)
120} 120}
121 121
122/* Requires cpu_add_remove_lock to be held */ 122/* Requires cpu_add_remove_lock to be held */
123static int _cpu_down(unsigned int cpu) 123static int _cpu_down(unsigned int cpu, int tasks_frozen)
124{ 124{
125 int err; 125 int err, nr_calls = 0;
126 struct task_struct *p; 126 struct task_struct *p;
127 cpumask_t old_allowed, tmp; 127 cpumask_t old_allowed, tmp;
128 void *hcpu = (void *)(long)cpu;
129 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
128 130
129 if (num_online_cpus() == 1) 131 if (num_online_cpus() == 1)
130 return -EBUSY; 132 return -EBUSY;
@@ -132,12 +134,16 @@ static int _cpu_down(unsigned int cpu)
132 if (!cpu_online(cpu)) 134 if (!cpu_online(cpu))
133 return -EINVAL; 135 return -EINVAL;
134 136
135 err = raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 137 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
136 (void *)(long)cpu); 138 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
139 hcpu, -1, &nr_calls);
137 if (err == NOTIFY_BAD) { 140 if (err == NOTIFY_BAD) {
141 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
142 hcpu, nr_calls, NULL);
138 printk("%s: attempt to take down CPU %u failed\n", 143 printk("%s: attempt to take down CPU %u failed\n",
139 __FUNCTION__, cpu); 144 __FUNCTION__, cpu);
140 return -EINVAL; 145 err = -EINVAL;
146 goto out_release;
141 } 147 }
142 148
143 /* Ensure that we are not runnable on dying cpu */ 149 /* Ensure that we are not runnable on dying cpu */
@@ -152,8 +158,8 @@ static int _cpu_down(unsigned int cpu)
152 158
153 if (IS_ERR(p) || cpu_online(cpu)) { 159 if (IS_ERR(p) || cpu_online(cpu)) {
154 /* CPU didn't die: tell everyone. Can't complain. */ 160 /* CPU didn't die: tell everyone. Can't complain. */
155 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 161 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
156 (void *)(long)cpu) == NOTIFY_BAD) 162 hcpu) == NOTIFY_BAD)
157 BUG(); 163 BUG();
158 164
159 if (IS_ERR(p)) { 165 if (IS_ERR(p)) {
@@ -170,13 +176,9 @@ static int _cpu_down(unsigned int cpu)
170 /* This actually kills the CPU. */ 176 /* This actually kills the CPU. */
171 __cpu_die(cpu); 177 __cpu_die(cpu);
172 178
173 /* Move it here so it can run. */
174 kthread_bind(p, get_cpu());
175 put_cpu();
176
177 /* CPU is completely dead: tell everyone. Too late to complain. */ 179 /* CPU is completely dead: tell everyone. Too late to complain. */
178 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD, 180 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
179 (void *)(long)cpu) == NOTIFY_BAD) 181 hcpu) == NOTIFY_BAD)
180 BUG(); 182 BUG();
181 183
182 check_for_tasks(cpu); 184 check_for_tasks(cpu);
@@ -185,6 +187,8 @@ out_thread:
185 err = kthread_stop(p); 187 err = kthread_stop(p);
186out_allowed: 188out_allowed:
187 set_cpus_allowed(current, old_allowed); 189 set_cpus_allowed(current, old_allowed);
190out_release:
191 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
188 return err; 192 return err;
189} 193}
190 194
@@ -196,7 +200,7 @@ int cpu_down(unsigned int cpu)
196 if (cpu_hotplug_disabled) 200 if (cpu_hotplug_disabled)
197 err = -EBUSY; 201 err = -EBUSY;
198 else 202 else
199 err = _cpu_down(cpu); 203 err = _cpu_down(cpu, 0);
200 204
201 mutex_unlock(&cpu_add_remove_lock); 205 mutex_unlock(&cpu_add_remove_lock);
202 return err; 206 return err;
@@ -204,15 +208,18 @@ int cpu_down(unsigned int cpu)
204#endif /*CONFIG_HOTPLUG_CPU*/ 208#endif /*CONFIG_HOTPLUG_CPU*/
205 209
206/* Requires cpu_add_remove_lock to be held */ 210/* Requires cpu_add_remove_lock to be held */
207static int __cpuinit _cpu_up(unsigned int cpu) 211static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
208{ 212{
209 int ret; 213 int ret, nr_calls = 0;
210 void *hcpu = (void *)(long)cpu; 214 void *hcpu = (void *)(long)cpu;
215 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
211 216
212 if (cpu_online(cpu) || !cpu_present(cpu)) 217 if (cpu_online(cpu) || !cpu_present(cpu))
213 return -EINVAL; 218 return -EINVAL;
214 219
215 ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 220 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
221 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
222 -1, &nr_calls);
216 if (ret == NOTIFY_BAD) { 223 if (ret == NOTIFY_BAD) {
217 printk("%s: attempt to bring up CPU %u failed\n", 224 printk("%s: attempt to bring up CPU %u failed\n",
218 __FUNCTION__, cpu); 225 __FUNCTION__, cpu);
@@ -229,12 +236,13 @@ static int __cpuinit _cpu_up(unsigned int cpu)
229 BUG_ON(!cpu_online(cpu)); 236 BUG_ON(!cpu_online(cpu));
230 237
231 /* Now call notifier in preparation. */ 238 /* Now call notifier in preparation. */
232 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); 239 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
233 240
234out_notify: 241out_notify:
235 if (ret != 0) 242 if (ret != 0)
236 raw_notifier_call_chain(&cpu_chain, 243 __raw_notifier_call_chain(&cpu_chain,
237 CPU_UP_CANCELED, hcpu); 244 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
245 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
238 246
239 return ret; 247 return ret;
240} 248}
@@ -247,19 +255,13 @@ int __cpuinit cpu_up(unsigned int cpu)
247 if (cpu_hotplug_disabled) 255 if (cpu_hotplug_disabled)
248 err = -EBUSY; 256 err = -EBUSY;
249 else 257 else
250 err = _cpu_up(cpu); 258 err = _cpu_up(cpu, 0);
251 259
252 mutex_unlock(&cpu_add_remove_lock); 260 mutex_unlock(&cpu_add_remove_lock);
253 return err; 261 return err;
254} 262}
255 263
256#ifdef CONFIG_SUSPEND_SMP 264#ifdef CONFIG_SUSPEND_SMP
257/* Needed to prevent the microcode driver from requesting firmware in its CPU
258 * hotplug notifier during the suspend/resume.
259 */
260int suspend_cpu_hotplug;
261EXPORT_SYMBOL(suspend_cpu_hotplug);
262
263static cpumask_t frozen_cpus; 265static cpumask_t frozen_cpus;
264 266
265int disable_nonboot_cpus(void) 267int disable_nonboot_cpus(void)
@@ -267,7 +269,6 @@ int disable_nonboot_cpus(void)
267 int cpu, first_cpu, error = 0; 269 int cpu, first_cpu, error = 0;
268 270
269 mutex_lock(&cpu_add_remove_lock); 271 mutex_lock(&cpu_add_remove_lock);
270 suspend_cpu_hotplug = 1;
271 first_cpu = first_cpu(cpu_online_map); 272 first_cpu = first_cpu(cpu_online_map);
272 /* We take down all of the non-boot CPUs in one shot to avoid races 273 /* We take down all of the non-boot CPUs in one shot to avoid races
273 * with the userspace trying to use the CPU hotplug at the same time 274 * with the userspace trying to use the CPU hotplug at the same time
@@ -277,7 +278,7 @@ int disable_nonboot_cpus(void)
277 for_each_online_cpu(cpu) { 278 for_each_online_cpu(cpu) {
278 if (cpu == first_cpu) 279 if (cpu == first_cpu)
279 continue; 280 continue;
280 error = _cpu_down(cpu); 281 error = _cpu_down(cpu, 1);
281 if (!error) { 282 if (!error) {
282 cpu_set(cpu, frozen_cpus); 283 cpu_set(cpu, frozen_cpus);
283 printk("CPU%d is down\n", cpu); 284 printk("CPU%d is down\n", cpu);
@@ -294,7 +295,6 @@ int disable_nonboot_cpus(void)
294 } else { 295 } else {
295 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 296 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
296 } 297 }
297 suspend_cpu_hotplug = 0;
298 mutex_unlock(&cpu_add_remove_lock); 298 mutex_unlock(&cpu_add_remove_lock);
299 return error; 299 return error;
300} 300}
@@ -309,10 +309,9 @@ void enable_nonboot_cpus(void)
309 if (cpus_empty(frozen_cpus)) 309 if (cpus_empty(frozen_cpus))
310 goto out; 310 goto out;
311 311
312 suspend_cpu_hotplug = 1;
313 printk("Enabling non-boot CPUs ...\n"); 312 printk("Enabling non-boot CPUs ...\n");
314 for_each_cpu_mask(cpu, frozen_cpus) { 313 for_each_cpu_mask(cpu, frozen_cpus) {
315 error = _cpu_up(cpu); 314 error = _cpu_up(cpu, 1);
316 if (!error) { 315 if (!error) {
317 printk("CPU%d is up\n", cpu); 316 printk("CPU%d is up\n", cpu);
318 continue; 317 continue;
@@ -320,7 +319,6 @@ void enable_nonboot_cpus(void)
320 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 319 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
321 } 320 }
322 cpus_clear(frozen_cpus); 321 cpus_clear(frozen_cpus);
323 suspend_cpu_hotplug = 0;
324out: 322out:
325 mutex_unlock(&cpu_add_remove_lock); 323 mutex_unlock(&cpu_add_remove_lock);
326} 324}
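
The kernel/cpu.c changes above fold the suspend/resume case into the normal hotplug path by OR-ing CPU_TASKS_FROZEN into the notifier action instead of exporting the global suspend_cpu_hotplug flag. A minimal sketch of how a notifier might cope with both variants (hypothetical callback and comments, not part of this patch; only CPU_TASKS_FROZEN, CPU_UP_PREPARE, CPU_ONLINE and CPU_DEAD are taken from the diff):

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

static int example_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	printk(KERN_INFO "cpu %u: hotplug event %lu\n", cpu, action);

	/* Mask off the modifier so frozen and normal events share one switch */
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		/* allocate per-CPU state for 'cpu' here */
		break;
	case CPU_ONLINE:
		/* start using 'cpu' */
		break;
	case CPU_DEAD:
		/* tear down per-CPU state for 'cpu' */
		break;
	}
	return NOTIFY_OK;
}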
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 88b416dfbc72..f57854b08922 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1772,12 +1772,7 @@ static ssize_t cpuset_tasks_read(struct file *file, char __user *buf,
1772{ 1772{
1773 struct ctr_struct *ctr = file->private_data; 1773 struct ctr_struct *ctr = file->private_data;
1774 1774
1775 if (*ppos + nbytes > ctr->bufsz) 1775 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
1776 nbytes = ctr->bufsz - *ppos;
1777 if (copy_to_user(buf, ctr->buf + *ppos, nbytes))
1778 return -EFAULT;
1779 *ppos += nbytes;
1780 return nbytes;
1781} 1776}
1782 1777
1783static int cpuset_tasks_release(struct inode *unused_inode, struct file *file) 1778static int cpuset_tasks_release(struct inode *unused_inode, struct file *file)
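
Both the kernel/configs.c and kernel/cpuset.c hunks above drop open-coded bounds checks, copy_to_user() calls and *ppos updates in favour of simple_read_from_buffer(), which does the same clamping and offset bookkeeping in one call. A minimal sketch of the resulting pattern (hypothetical read handler and buffer names, not part of this patch):

#include <linux/fs.h>

static const char example_buf[] = "example data\n";
static const size_t example_len = sizeof(example_buf) - 1;

static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	/* example_buf/example_len stand in for the exported kernel buffer */
	return simple_read_from_buffer(buf, count, ppos,
				       example_buf, example_len);
}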
diff --git a/kernel/exit.c b/kernel/exit.c
index f5a7abb621f3..b0c6f0c3a2df 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -26,6 +26,7 @@
26#include <linux/profile.h> 26#include <linux/profile.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
29#include <linux/kthread.h>
29#include <linux/mempolicy.h> 30#include <linux/mempolicy.h>
30#include <linux/taskstats_kern.h> 31#include <linux/taskstats_kern.h>
31#include <linux/delayacct.h> 32#include <linux/delayacct.h>
@@ -254,26 +255,25 @@ static int has_stopped_jobs(struct pid *pgrp)
254} 255}
255 256
256/** 257/**
257 * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. 258 * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
258 * 259 *
259 * If a kernel thread is launched as a result of a system call, or if 260 * If a kernel thread is launched as a result of a system call, or if
260 * it ever exits, it should generally reparent itself to init so that 261 * it ever exits, it should generally reparent itself to kthreadd so it
261 * it is correctly cleaned up on exit. 262 * isn't in the way of other processes and is correctly cleaned up on exit.
262 * 263 *
263 * The various task state such as scheduling policy and priority may have 264 * The various task state such as scheduling policy and priority may have
264 * been inherited from a user process, so we reset them to sane values here. 265 * been inherited from a user process, so we reset them to sane values here.
265 * 266 *
266 * NOTE that reparent_to_init() gives the caller full capabilities. 267 * NOTE that reparent_to_kthreadd() gives the caller full capabilities.
267 */ 268 */
268static void reparent_to_init(void) 269static void reparent_to_kthreadd(void)
269{ 270{
270 write_lock_irq(&tasklist_lock); 271 write_lock_irq(&tasklist_lock);
271 272
272 ptrace_unlink(current); 273 ptrace_unlink(current);
273 /* Reparent to init */ 274 /* Reparent to init */
274 remove_parent(current); 275 remove_parent(current);
275 current->parent = child_reaper(current); 276 current->real_parent = current->parent = kthreadd_task;
276 current->real_parent = child_reaper(current);
277 add_parent(current); 277 add_parent(current);
278 278
279 /* Set the exit signal to SIGCHLD so we signal init on exit */ 279 /* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -347,7 +347,7 @@ int disallow_signal(int sig)
347 return -EINVAL; 347 return -EINVAL;
348 348
349 spin_lock_irq(&current->sighand->siglock); 349 spin_lock_irq(&current->sighand->siglock);
350 sigaddset(&current->blocked, sig); 350 current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
351 recalc_sigpending(); 351 recalc_sigpending();
352 spin_unlock_irq(&current->sighand->siglock); 352 spin_unlock_irq(&current->sighand->siglock);
353 return 0; 353 return 0;
@@ -400,7 +400,7 @@ void daemonize(const char *name, ...)
400 current->files = init_task.files; 400 current->files = init_task.files;
401 atomic_inc(&current->files->count); 401 atomic_inc(&current->files->count);
402 402
403 reparent_to_init(); 403 reparent_to_kthreadd();
404} 404}
405 405
406EXPORT_SYMBOL(daemonize); 406EXPORT_SYMBOL(daemonize);
diff --git a/kernel/fork.c b/kernel/fork.c
index a8dd75d4992b..5dd3979747f5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,7 +105,7 @@ static struct kmem_cache *mm_cachep;
105 105
106void free_task(struct task_struct *tsk) 106void free_task(struct task_struct *tsk)
107{ 107{
108 free_thread_info(tsk->thread_info); 108 free_thread_info(tsk->stack);
109 rt_mutex_debug_task_free(tsk); 109 rt_mutex_debug_task_free(tsk);
110 free_task_struct(tsk); 110 free_task_struct(tsk);
111} 111}
@@ -175,7 +175,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
175 } 175 }
176 176
177 *tsk = *orig; 177 *tsk = *orig;
178 tsk->thread_info = ti; 178 tsk->stack = ti;
179 setup_thread_stack(tsk, orig); 179 setup_thread_stack(tsk, orig);
180 180
181#ifdef CONFIG_CC_STACKPROTECTOR 181#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/futex.c b/kernel/futex.c
index 600bc9d801f2..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 * 18 *
19 * PRIVATE futexes by Eric Dumazet
20 * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21 *
19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 22 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
20 * enough at me, Linus for the original (flawed) idea, Matthew 23 * enough at me, Linus for the original (flawed) idea, Matthew
21 * Kirkwood for proof-of-concept implementation. 24 * Kirkwood for proof-of-concept implementation.
@@ -53,6 +56,12 @@
53 56
54#include "rtmutex_common.h" 57#include "rtmutex_common.h"
55 58
59#ifdef CONFIG_DEBUG_RT_MUTEXES
60# include "rtmutex-debug.h"
61#else
62# include "rtmutex.h"
63#endif
64
56#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 65#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
57 66
58/* 67/*
@@ -81,12 +90,12 @@ struct futex_pi_state {
81 * we can wake only the relevant ones (hashed queues may be shared). 90 * we can wake only the relevant ones (hashed queues may be shared).
82 * 91 *
83 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 92 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
84 * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. 93 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
85 * The order of wakup is always to make the first condition true, then 94 * The order of wakup is always to make the first condition true, then
86 * wake up q->waiters, then make the second condition true. 95 * wake up q->waiters, then make the second condition true.
87 */ 96 */
88struct futex_q { 97struct futex_q {
89 struct list_head list; 98 struct plist_node list;
90 wait_queue_head_t waiters; 99 wait_queue_head_t waiters;
91 100
92 /* Which hash list lock to use: */ 101 /* Which hash list lock to use: */
@@ -102,14 +111,20 @@ struct futex_q {
102 /* Optional priority inheritance state: */ 111 /* Optional priority inheritance state: */
103 struct futex_pi_state *pi_state; 112 struct futex_pi_state *pi_state;
104 struct task_struct *task; 113 struct task_struct *task;
114
115 /*
116 * This waiter is used in case of requeue from a
117 * normal futex to a PI-futex
118 */
119 struct rt_mutex_waiter waiter;
105}; 120};
106 121
107/* 122/*
108 * Split the global futex_lock into every hash list lock. 123 * Split the global futex_lock into every hash list lock.
109 */ 124 */
110struct futex_hash_bucket { 125struct futex_hash_bucket {
111 spinlock_t lock; 126 spinlock_t lock;
112 struct list_head chain; 127 struct plist_head chain;
113}; 128};
114 129
115static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 130static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -138,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
138 && key1->both.offset == key2->both.offset); 153 && key1->both.offset == key2->both.offset);
139} 154}
140 155
141/* 156/**
142 * Get parameters which are the keys for a futex. 157 * get_futex_key - Get parameters which are the keys for a futex.
158 * @uaddr: virtual address of the futex
159 * @shared: NULL for a PROCESS_PRIVATE futex,
160 * &current->mm->mmap_sem for a PROCESS_SHARED futex
161 * @key: address where result is stored.
162 *
163 * Returns a negative error code or 0
164 * The key words are stored in *key on success.
143 * 165 *
144 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, 166 * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
145 * offset_within_page). For private mappings, it's (uaddr, current->mm). 167 * offset_within_page). For private mappings, it's (uaddr, current->mm).
146 * We can usually work out the index without swapping in the page. 168 * We can usually work out the index without swapping in the page.
147 * 169 *
148 * Returns: 0, or negative error code. 170 * fshared is NULL for PROCESS_PRIVATE futexes
149 * The key words are stored in *key on success. 171 * For other futexes, it points to &current->mm->mmap_sem and
150 * 172 * caller must have taken the reader lock. but NOT any spinlocks.
151 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
152 */ 173 */
153int get_futex_key(u32 __user *uaddr, union futex_key *key) 174int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
175 union futex_key *key)
154{ 176{
155 unsigned long address = (unsigned long)uaddr; 177 unsigned long address = (unsigned long)uaddr;
156 struct mm_struct *mm = current->mm; 178 struct mm_struct *mm = current->mm;
@@ -162,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
162 * The futex address must be "naturally" aligned. 184 * The futex address must be "naturally" aligned.
163 */ 185 */
164 key->both.offset = address % PAGE_SIZE; 186 key->both.offset = address % PAGE_SIZE;
165 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 187 if (unlikely((address % sizeof(u32)) != 0))
166 return -EINVAL; 188 return -EINVAL;
167 address -= key->both.offset; 189 address -= key->both.offset;
168 190
169 /* 191 /*
192 * PROCESS_PRIVATE futexes are fast.
193 * As the mm cannot disappear under us and the 'key' only needs
 194 * virtual address, we don't even have to find the underlying vma.
 195 * Note: We do have to check 'uaddr' is a valid user address,
196 * but access_ok() should be faster than find_vma()
197 */
198 if (!fshared) {
199 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
200 return -EFAULT;
201 key->private.mm = mm;
202 key->private.address = address;
203 return 0;
204 }
205 /*
170 * The futex is hashed differently depending on whether 206 * The futex is hashed differently depending on whether
171 * it's in a shared or private mapping. So check vma first. 207 * it's in a shared or private mapping. So check vma first.
172 */ 208 */
@@ -180,6 +216,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
180 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 216 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
181 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 217 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
182 218
 219 /* Save the user address in the key */
220 key->uaddr = uaddr;
221
183 /* 222 /*
184 * Private mappings are handled in a simple way. 223 * Private mappings are handled in a simple way.
185 * 224 *
@@ -190,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
190 * mappings of _writable_ handles. 229 * mappings of _writable_ handles.
191 */ 230 */
192 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 231 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
232 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
193 key->private.mm = mm; 233 key->private.mm = mm;
194 key->private.address = address; 234 key->private.address = address;
195 return 0; 235 return 0;
@@ -199,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
199 * Linear file mappings are also simple. 239 * Linear file mappings are also simple.
200 */ 240 */
201 key->shared.inode = vma->vm_file->f_path.dentry->d_inode; 241 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
202 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 242 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
203 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 243 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
204 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) 244 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
205 + vma->vm_pgoff); 245 + vma->vm_pgoff);
@@ -227,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
227 * Take a reference to the resource addressed by a key. 267 * Take a reference to the resource addressed by a key.
228 * Can be called while holding spinlocks. 268 * Can be called while holding spinlocks.
229 * 269 *
230 * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
231 * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
232 */ 270 */
233inline void get_futex_key_refs(union futex_key *key) 271inline void get_futex_key_refs(union futex_key *key)
234{ 272{
235 if (key->both.ptr != 0) { 273 if (key->both.ptr == 0)
236 if (key->both.offset & 1) 274 return;
275 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
276 case FUT_OFF_INODE:
237 atomic_inc(&key->shared.inode->i_count); 277 atomic_inc(&key->shared.inode->i_count);
238 else 278 break;
279 case FUT_OFF_MMSHARED:
239 atomic_inc(&key->private.mm->mm_count); 280 atomic_inc(&key->private.mm->mm_count);
281 break;
240 } 282 }
241} 283}
242EXPORT_SYMBOL_GPL(get_futex_key_refs); 284EXPORT_SYMBOL_GPL(get_futex_key_refs);
@@ -247,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
247 */ 289 */
248void drop_futex_key_refs(union futex_key *key) 290void drop_futex_key_refs(union futex_key *key)
249{ 291{
250 if (key->both.ptr != 0) { 292 if (key->both.ptr == 0)
251 if (key->both.offset & 1) 293 return;
294 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
295 case FUT_OFF_INODE:
252 iput(key->shared.inode); 296 iput(key->shared.inode);
253 else 297 break;
298 case FUT_OFF_MMSHARED:
254 mmdrop(key->private.mm); 299 mmdrop(key->private.mm);
300 break;
255 } 301 }
256} 302}
257EXPORT_SYMBOL_GPL(drop_futex_key_refs); 303EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -268,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
268} 314}
269 315
270/* 316/*
271 * Fault handling. Called with current->mm->mmap_sem held. 317 * Fault handling.
318 * if fshared is non NULL, current->mm->mmap_sem is already held
272 */ 319 */
273static int futex_handle_fault(unsigned long address, int attempt) 320static int futex_handle_fault(unsigned long address,
321 struct rw_semaphore *fshared, int attempt)
274{ 322{
275 struct vm_area_struct * vma; 323 struct vm_area_struct * vma;
276 struct mm_struct *mm = current->mm; 324 struct mm_struct *mm = current->mm;
325 int ret = -EFAULT;
277 326
278 if (attempt > 2 || !(vma = find_vma(mm, address)) || 327 if (attempt > 2)
279 vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) 328 return ret;
280 return -EFAULT;
281 329
282 switch (handle_mm_fault(mm, vma, address, 1)) { 330 if (!fshared)
283 case VM_FAULT_MINOR: 331 down_read(&mm->mmap_sem);
284 current->min_flt++; 332 vma = find_vma(mm, address);
285 break; 333 if (vma && address >= vma->vm_start &&
286 case VM_FAULT_MAJOR: 334 (vma->vm_flags & VM_WRITE)) {
287 current->maj_flt++; 335 switch (handle_mm_fault(mm, vma, address, 1)) {
288 break; 336 case VM_FAULT_MINOR:
289 default: 337 ret = 0;
290 return -EFAULT; 338 current->min_flt++;
339 break;
340 case VM_FAULT_MAJOR:
341 ret = 0;
342 current->maj_flt++;
343 break;
344 }
291 } 345 }
292 return 0; 346 if (!fshared)
347 up_read(&mm->mmap_sem);
348 return ret;
293} 349}
294 350
295/* 351/*
@@ -439,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr)
439} 495}
440 496
441static int 497static int
442lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) 498lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
499 union futex_key *key, struct futex_pi_state **ps)
443{ 500{
444 struct futex_pi_state *pi_state = NULL; 501 struct futex_pi_state *pi_state = NULL;
445 struct futex_q *this, *next; 502 struct futex_q *this, *next;
446 struct list_head *head; 503 struct plist_head *head;
447 struct task_struct *p; 504 struct task_struct *p;
448 pid_t pid; 505 pid_t pid;
449 506
450 head = &hb->chain; 507 head = &hb->chain;
451 508
452 list_for_each_entry_safe(this, next, head, list) { 509 plist_for_each_entry_safe(this, next, head, list) {
453 if (match_futex(&this->key, &me->key)) { 510 if (match_futex(&this->key, key)) {
454 /* 511 /*
455 * Another waiter already exists - bump up 512 * Another waiter already exists - bump up
456 * the refcount and return its pi_state: 513 * the refcount and return its pi_state:
@@ -465,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
465 WARN_ON(!atomic_read(&pi_state->refcount)); 522 WARN_ON(!atomic_read(&pi_state->refcount));
466 523
467 atomic_inc(&pi_state->refcount); 524 atomic_inc(&pi_state->refcount);
468 me->pi_state = pi_state; 525 *ps = pi_state;
469 526
470 return 0; 527 return 0;
471 } 528 }
@@ -492,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
492 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); 549 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
493 550
494 /* Store the key for possible exit cleanups: */ 551 /* Store the key for possible exit cleanups: */
495 pi_state->key = me->key; 552 pi_state->key = *key;
496 553
497 spin_lock_irq(&p->pi_lock); 554 spin_lock_irq(&p->pi_lock);
498 WARN_ON(!list_empty(&pi_state->list)); 555 WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
502 559
503 put_task_struct(p); 560 put_task_struct(p);
504 561
505 me->pi_state = pi_state; 562 *ps = pi_state;
506 563
507 return 0; 564 return 0;
508} 565}
@@ -513,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
513 */ 570 */
514static void wake_futex(struct futex_q *q) 571static void wake_futex(struct futex_q *q)
515{ 572{
516 list_del_init(&q->list); 573 plist_del(&q->list, &q->list.plist);
517 if (q->filp) 574 if (q->filp)
518 send_sigio(&q->filp->f_owner, q->fd, POLL_IN); 575 send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
519 /* 576 /*
520 * The lock in wake_up_all() is a crucial memory barrier after the 577 * The lock in wake_up_all() is a crucial memory barrier after the
521 * list_del_init() and also before assigning to q->lock_ptr. 578 * plist_del() and also before assigning to q->lock_ptr.
522 */ 579 */
523 wake_up_all(&q->waiters); 580 wake_up_all(&q->waiters);
524 /* 581 /*
@@ -562,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
562 */ 619 */
563 if (!(uval & FUTEX_OWNER_DIED)) { 620 if (!(uval & FUTEX_OWNER_DIED)) {
564 newval = FUTEX_WAITERS | new_owner->pid; 621 newval = FUTEX_WAITERS | new_owner->pid;
622 /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
623 newval |= (uval & FUTEX_WAITER_REQUEUED);
565 624
566 pagefault_disable(); 625 pagefault_disable();
567 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 626 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -629,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
629 * Wake up all waiters hashed on the physical page that is mapped 688 * Wake up all waiters hashed on the physical page that is mapped
630 * to this virtual address: 689 * to this virtual address:
631 */ 690 */
632static int futex_wake(u32 __user *uaddr, int nr_wake) 691static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
692 int nr_wake)
633{ 693{
634 struct futex_hash_bucket *hb; 694 struct futex_hash_bucket *hb;
635 struct futex_q *this, *next; 695 struct futex_q *this, *next;
636 struct list_head *head; 696 struct plist_head *head;
637 union futex_key key; 697 union futex_key key;
638 int ret; 698 int ret;
639 699
640 down_read(&current->mm->mmap_sem); 700 if (fshared)
701 down_read(fshared);
641 702
642 ret = get_futex_key(uaddr, &key); 703 ret = get_futex_key(uaddr, fshared, &key);
643 if (unlikely(ret != 0)) 704 if (unlikely(ret != 0))
644 goto out; 705 goto out;
645 706
@@ -647,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
647 spin_lock(&hb->lock); 708 spin_lock(&hb->lock);
648 head = &hb->chain; 709 head = &hb->chain;
649 710
650 list_for_each_entry_safe(this, next, head, list) { 711 plist_for_each_entry_safe(this, next, head, list) {
651 if (match_futex (&this->key, &key)) { 712 if (match_futex (&this->key, &key)) {
652 if (this->pi_state) { 713 if (this->pi_state) {
653 ret = -EINVAL; 714 ret = -EINVAL;
@@ -661,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
661 722
662 spin_unlock(&hb->lock); 723 spin_unlock(&hb->lock);
663out: 724out:
664 up_read(&current->mm->mmap_sem); 725 if (fshared)
726 up_read(fshared);
727 return ret;
728}
729
730/*
731 * Called from futex_requeue_pi.
732 * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
 733 * PI-futex value; search its associated pi_state if an owner exists
734 * or create a new one without owner.
735 */
736static inline int
737lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
738 union futex_key *key,
739 struct futex_pi_state **pi_state)
740{
741 u32 curval, uval, newval;
742
743retry:
744 /*
745 * We can't handle a fault cleanly because we can't
746 * release the locks here. Simply return the fault.
747 */
748 if (get_futex_value_locked(&curval, uaddr))
749 return -EFAULT;
750
751 /* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
752 if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
753 != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
754 /*
755 * No waiters yet, we prepare the futex to have some waiters.
756 */
757
758 uval = curval;
759 newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
760
761 pagefault_disable();
762 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
763 pagefault_enable();
764
765 if (unlikely(curval == -EFAULT))
766 return -EFAULT;
767 if (unlikely(curval != uval))
768 goto retry;
769 }
770
771 if (!(curval & FUTEX_TID_MASK)
772 || lookup_pi_state(curval, hb, key, pi_state)) {
773 /* the futex has no owner (yet) or the lookup failed:
774 allocate one pi_state without owner */
775
776 *pi_state = alloc_pi_state();
777
778 /* Already stores the key: */
779 (*pi_state)->key = *key;
780
781 /* init the mutex without owner */
782 __rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
783 }
784
785 return 0;
786}
787
788/*
789 * Keep the first nr_wake waiter from futex1, wake up one,
790 * and requeue the next nr_requeue waiters following hashed on
791 * one physical page to another physical page (PI-futex uaddr2)
792 */
793static int futex_requeue_pi(u32 __user *uaddr1,
794 struct rw_semaphore *fshared,
795 u32 __user *uaddr2,
796 int nr_wake, int nr_requeue, u32 *cmpval)
797{
798 union futex_key key1, key2;
799 struct futex_hash_bucket *hb1, *hb2;
800 struct plist_head *head1;
801 struct futex_q *this, *next;
802 struct futex_pi_state *pi_state2 = NULL;
803 struct rt_mutex_waiter *waiter, *top_waiter = NULL;
804 struct rt_mutex *lock2 = NULL;
805 int ret, drop_count = 0;
806
807 if (refill_pi_state_cache())
808 return -ENOMEM;
809
810retry:
811 /*
812 * First take all the futex related locks:
813 */
814 if (fshared)
815 down_read(fshared);
816
817 ret = get_futex_key(uaddr1, fshared, &key1);
818 if (unlikely(ret != 0))
819 goto out;
820 ret = get_futex_key(uaddr2, fshared, &key2);
821 if (unlikely(ret != 0))
822 goto out;
823
824 hb1 = hash_futex(&key1);
825 hb2 = hash_futex(&key2);
826
827 double_lock_hb(hb1, hb2);
828
829 if (likely(cmpval != NULL)) {
830 u32 curval;
831
832 ret = get_futex_value_locked(&curval, uaddr1);
833
834 if (unlikely(ret)) {
835 spin_unlock(&hb1->lock);
836 if (hb1 != hb2)
837 spin_unlock(&hb2->lock);
838
839 /*
840 * If we would have faulted, release mmap_sem, fault
841 * it in and start all over again.
842 */
843 if (fshared)
844 up_read(fshared);
845
846 ret = get_user(curval, uaddr1);
847
848 if (!ret)
849 goto retry;
850
851 return ret;
852 }
853 if (curval != *cmpval) {
854 ret = -EAGAIN;
855 goto out_unlock;
856 }
857 }
858
859 head1 = &hb1->chain;
860 plist_for_each_entry_safe(this, next, head1, list) {
861 if (!match_futex (&this->key, &key1))
862 continue;
863 if (++ret <= nr_wake) {
864 wake_futex(this);
865 } else {
866 /*
867 * FIRST: get and set the pi_state
868 */
869 if (!pi_state2) {
870 int s;
871 /* do this only the first time we requeue someone */
872 s = lookup_pi_state_for_requeue(uaddr2, hb2,
873 &key2, &pi_state2);
874 if (s) {
875 ret = s;
876 goto out_unlock;
877 }
878
879 lock2 = &pi_state2->pi_mutex;
880 spin_lock(&lock2->wait_lock);
881
882 /* Save the top waiter of the wait_list */
883 if (rt_mutex_has_waiters(lock2))
884 top_waiter = rt_mutex_top_waiter(lock2);
885 } else
886 atomic_inc(&pi_state2->refcount);
887
888
889 this->pi_state = pi_state2;
890
891 /*
892 * SECOND: requeue futex_q to the correct hashbucket
893 */
894
895 /*
896 * If key1 and key2 hash to the same bucket, no need to
897 * requeue.
898 */
899 if (likely(head1 != &hb2->chain)) {
900 plist_del(&this->list, &hb1->chain);
901 plist_add(&this->list, &hb2->chain);
902 this->lock_ptr = &hb2->lock;
903#ifdef CONFIG_DEBUG_PI_LIST
904 this->list.plist.lock = &hb2->lock;
905#endif
906 }
907 this->key = key2;
908 get_futex_key_refs(&key2);
909 drop_count++;
910
911
912 /*
913 * THIRD: queue it to lock2
914 */
915 spin_lock_irq(&this->task->pi_lock);
916 waiter = &this->waiter;
917 waiter->task = this->task;
918 waiter->lock = lock2;
919 plist_node_init(&waiter->list_entry, this->task->prio);
920 plist_node_init(&waiter->pi_list_entry, this->task->prio);
921 plist_add(&waiter->list_entry, &lock2->wait_list);
922 this->task->pi_blocked_on = waiter;
923 spin_unlock_irq(&this->task->pi_lock);
924
925 if (ret - nr_wake >= nr_requeue)
926 break;
927 }
928 }
929
930 /* If we've requeued some tasks and the top_waiter of the rt_mutex
931 has changed, we must adjust the priority of the owner, if any */
932 if (drop_count) {
933 struct task_struct *owner = rt_mutex_owner(lock2);
934 if (owner &&
935 (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
936 int chain_walk = 0;
937
938 spin_lock_irq(&owner->pi_lock);
939 if (top_waiter)
940 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
941 else
942 /*
 943 * There were no waiters before the requeue,
944 * the flag must be updated
945 */
946 mark_rt_mutex_waiters(lock2);
947
948 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
949 __rt_mutex_adjust_prio(owner);
950 if (owner->pi_blocked_on) {
951 chain_walk = 1;
952 get_task_struct(owner);
953 }
954
955 spin_unlock_irq(&owner->pi_lock);
956 spin_unlock(&lock2->wait_lock);
957
958 if (chain_walk)
959 rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
960 current);
961 } else {
962 /* No owner or the top_waiter does not change */
963 mark_rt_mutex_waiters(lock2);
964 spin_unlock(&lock2->wait_lock);
965 }
966 }
967
968out_unlock:
969 spin_unlock(&hb1->lock);
970 if (hb1 != hb2)
971 spin_unlock(&hb2->lock);
972
973 /* drop_futex_key_refs() must be called outside the spinlocks. */
974 while (--drop_count >= 0)
975 drop_futex_key_refs(&key1);
976
977out:
978 if (fshared)
979 up_read(fshared);
665 return ret; 980 return ret;
666} 981}
667 982
@@ -670,22 +985,24 @@ out:
670 * to this virtual address: 985 * to this virtual address:
671 */ 986 */
672static int 987static int
673futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, 988futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
989 u32 __user *uaddr2,
674 int nr_wake, int nr_wake2, int op) 990 int nr_wake, int nr_wake2, int op)
675{ 991{
676 union futex_key key1, key2; 992 union futex_key key1, key2;
677 struct futex_hash_bucket *hb1, *hb2; 993 struct futex_hash_bucket *hb1, *hb2;
678 struct list_head *head; 994 struct plist_head *head;
679 struct futex_q *this, *next; 995 struct futex_q *this, *next;
680 int ret, op_ret, attempt = 0; 996 int ret, op_ret, attempt = 0;
681 997
682retryfull: 998retryfull:
683 down_read(&current->mm->mmap_sem); 999 if (fshared)
1000 down_read(fshared);
684 1001
685 ret = get_futex_key(uaddr1, &key1); 1002 ret = get_futex_key(uaddr1, fshared, &key1);
686 if (unlikely(ret != 0)) 1003 if (unlikely(ret != 0))
687 goto out; 1004 goto out;
688 ret = get_futex_key(uaddr2, &key2); 1005 ret = get_futex_key(uaddr2, fshared, &key2);
689 if (unlikely(ret != 0)) 1006 if (unlikely(ret != 0))
690 goto out; 1007 goto out;
691 1008
@@ -725,11 +1042,10 @@ retry:
725 * still holding the mmap_sem. 1042 * still holding the mmap_sem.
726 */ 1043 */
727 if (attempt++) { 1044 if (attempt++) {
728 if (futex_handle_fault((unsigned long)uaddr2, 1045 ret = futex_handle_fault((unsigned long)uaddr2,
729 attempt)) { 1046 fshared, attempt);
730 ret = -EFAULT; 1047 if (ret)
731 goto out; 1048 goto out;
732 }
733 goto retry; 1049 goto retry;
734 } 1050 }
735 1051
@@ -737,7 +1053,8 @@ retry:
737 * If we would have faulted, release mmap_sem, 1053 * If we would have faulted, release mmap_sem,
738 * fault it in and start all over again. 1054 * fault it in and start all over again.
739 */ 1055 */
740 up_read(&current->mm->mmap_sem); 1056 if (fshared)
1057 up_read(fshared);
741 1058
742 ret = get_user(dummy, uaddr2); 1059 ret = get_user(dummy, uaddr2);
743 if (ret) 1060 if (ret)
@@ -748,7 +1065,7 @@ retry:
748 1065
749 head = &hb1->chain; 1066 head = &hb1->chain;
750 1067
751 list_for_each_entry_safe(this, next, head, list) { 1068 plist_for_each_entry_safe(this, next, head, list) {
752 if (match_futex (&this->key, &key1)) { 1069 if (match_futex (&this->key, &key1)) {
753 wake_futex(this); 1070 wake_futex(this);
754 if (++ret >= nr_wake) 1071 if (++ret >= nr_wake)
@@ -760,7 +1077,7 @@ retry:
760 head = &hb2->chain; 1077 head = &hb2->chain;
761 1078
762 op_ret = 0; 1079 op_ret = 0;
763 list_for_each_entry_safe(this, next, head, list) { 1080 plist_for_each_entry_safe(this, next, head, list) {
764 if (match_futex (&this->key, &key2)) { 1081 if (match_futex (&this->key, &key2)) {
765 wake_futex(this); 1082 wake_futex(this);
766 if (++op_ret >= nr_wake2) 1083 if (++op_ret >= nr_wake2)
@@ -774,7 +1091,8 @@ retry:
774 if (hb1 != hb2) 1091 if (hb1 != hb2)
775 spin_unlock(&hb2->lock); 1092 spin_unlock(&hb2->lock);
776out: 1093out:
777 up_read(&current->mm->mmap_sem); 1094 if (fshared)
1095 up_read(fshared);
778 return ret; 1096 return ret;
779} 1097}
780 1098
@@ -782,22 +1100,24 @@ out:
782 * Requeue all waiters hashed on one physical page to another 1100 * Requeue all waiters hashed on one physical page to another
783 * physical page. 1101 * physical page.
784 */ 1102 */
785static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, 1103static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
1104 u32 __user *uaddr2,
786 int nr_wake, int nr_requeue, u32 *cmpval) 1105 int nr_wake, int nr_requeue, u32 *cmpval)
787{ 1106{
788 union futex_key key1, key2; 1107 union futex_key key1, key2;
789 struct futex_hash_bucket *hb1, *hb2; 1108 struct futex_hash_bucket *hb1, *hb2;
790 struct list_head *head1; 1109 struct plist_head *head1;
791 struct futex_q *this, *next; 1110 struct futex_q *this, *next;
792 int ret, drop_count = 0; 1111 int ret, drop_count = 0;
793 1112
794 retry: 1113 retry:
795 down_read(&current->mm->mmap_sem); 1114 if (fshared)
1115 down_read(fshared);
796 1116
797 ret = get_futex_key(uaddr1, &key1); 1117 ret = get_futex_key(uaddr1, fshared, &key1);
798 if (unlikely(ret != 0)) 1118 if (unlikely(ret != 0))
799 goto out; 1119 goto out;
800 ret = get_futex_key(uaddr2, &key2); 1120 ret = get_futex_key(uaddr2, fshared, &key2);
801 if (unlikely(ret != 0)) 1121 if (unlikely(ret != 0))
802 goto out; 1122 goto out;
803 1123
@@ -820,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
820 * If we would have faulted, release mmap_sem, fault 1140 * If we would have faulted, release mmap_sem, fault
821 * it in and start all over again. 1141 * it in and start all over again.
822 */ 1142 */
823 up_read(&current->mm->mmap_sem); 1143 if (fshared)
1144 up_read(fshared);
824 1145
825 ret = get_user(curval, uaddr1); 1146 ret = get_user(curval, uaddr1);
826 1147
@@ -836,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
836 } 1157 }
837 1158
838 head1 = &hb1->chain; 1159 head1 = &hb1->chain;
839 list_for_each_entry_safe(this, next, head1, list) { 1160 plist_for_each_entry_safe(this, next, head1, list) {
840 if (!match_futex (&this->key, &key1)) 1161 if (!match_futex (&this->key, &key1))
841 continue; 1162 continue;
842 if (++ret <= nr_wake) { 1163 if (++ret <= nr_wake) {
@@ -847,9 +1168,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
847 * requeue. 1168 * requeue.
848 */ 1169 */
849 if (likely(head1 != &hb2->chain)) { 1170 if (likely(head1 != &hb2->chain)) {
850 list_move_tail(&this->list, &hb2->chain); 1171 plist_del(&this->list, &hb1->chain);
1172 plist_add(&this->list, &hb2->chain);
851 this->lock_ptr = &hb2->lock; 1173 this->lock_ptr = &hb2->lock;
852 } 1174#ifdef CONFIG_DEBUG_PI_LIST
1175 this->list.plist.lock = &hb2->lock;
1176#endif
1177 }
853 this->key = key2; 1178 this->key = key2;
854 get_futex_key_refs(&key2); 1179 get_futex_key_refs(&key2);
855 drop_count++; 1180 drop_count++;
@@ -869,7 +1194,8 @@ out_unlock:
869 drop_futex_key_refs(&key1); 1194 drop_futex_key_refs(&key1);
870 1195
871out: 1196out:
872 up_read(&current->mm->mmap_sem); 1197 if (fshared)
1198 up_read(fshared);
873 return ret; 1199 return ret;
874} 1200}
875 1201
@@ -894,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
894 1220
895static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1221static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
896{ 1222{
897 list_add_tail(&q->list, &hb->chain); 1223 int prio;
1224
1225 /*
1226 * The priority used to register this element is
1227 * - either the real thread-priority for the real-time threads
1228 * (i.e. threads with a priority lower than MAX_RT_PRIO)
1229 * - or MAX_RT_PRIO for non-RT threads.
1230 * Thus, all RT-threads are woken first in priority order, and
1231 * the others are woken last, in FIFO order.
1232 */
1233 prio = min(current->normal_prio, MAX_RT_PRIO);
1234
1235 plist_node_init(&q->list, prio);
1236#ifdef CONFIG_DEBUG_PI_LIST
1237 q->list.plist.lock = &hb->lock;
1238#endif
1239 plist_add(&q->list, &hb->chain);
898 q->task = current; 1240 q->task = current;
899 spin_unlock(&hb->lock); 1241 spin_unlock(&hb->lock);
900} 1242}
@@ -949,8 +1291,8 @@ static int unqueue_me(struct futex_q *q)
949 spin_unlock(lock_ptr); 1291 spin_unlock(lock_ptr);
950 goto retry; 1292 goto retry;
951 } 1293 }
952 WARN_ON(list_empty(&q->list)); 1294 WARN_ON(plist_node_empty(&q->list));
953 list_del(&q->list); 1295 plist_del(&q->list, &q->list.plist);
954 1296
955 BUG_ON(q->pi_state); 1297 BUG_ON(q->pi_state);
956 1298
@@ -964,39 +1306,104 @@ static int unqueue_me(struct futex_q *q)
964 1306
965/* 1307/*
966 * PI futexes can not be requeued and must remove themself from the 1308 * PI futexes can not be requeued and must remove themself from the
967 * hash bucket. The hash bucket lock is held on entry and dropped here. 1309 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1310 * and dropped here.
968 */ 1311 */
969static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) 1312static void unqueue_me_pi(struct futex_q *q)
970{ 1313{
971 WARN_ON(list_empty(&q->list)); 1314 WARN_ON(plist_node_empty(&q->list));
972 list_del(&q->list); 1315 plist_del(&q->list, &q->list.plist);
973 1316
974 BUG_ON(!q->pi_state); 1317 BUG_ON(!q->pi_state);
975 free_pi_state(q->pi_state); 1318 free_pi_state(q->pi_state);
976 q->pi_state = NULL; 1319 q->pi_state = NULL;
977 1320
978 spin_unlock(&hb->lock); 1321 spin_unlock(q->lock_ptr);
979 1322
980 drop_futex_key_refs(&q->key); 1323 drop_futex_key_refs(&q->key);
981} 1324}
982 1325
1326/*
1327 * Fixup the pi_state owner with current.
1328 *
1329 * The cur->mm semaphore must be held, it is released at return of this
1330 * function.
1331 */
1332static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
1333 struct futex_q *q,
1334 struct futex_hash_bucket *hb,
1335 struct task_struct *curr)
1336{
1337 u32 newtid = curr->pid | FUTEX_WAITERS;
1338 struct futex_pi_state *pi_state = q->pi_state;
1339 u32 uval, curval, newval;
1340 int ret;
1341
1342 /* Owner died? */
1343 if (pi_state->owner != NULL) {
1344 spin_lock_irq(&pi_state->owner->pi_lock);
1345 WARN_ON(list_empty(&pi_state->list));
1346 list_del_init(&pi_state->list);
1347 spin_unlock_irq(&pi_state->owner->pi_lock);
1348 } else
1349 newtid |= FUTEX_OWNER_DIED;
1350
1351 pi_state->owner = curr;
1352
1353 spin_lock_irq(&curr->pi_lock);
1354 WARN_ON(!list_empty(&pi_state->list));
1355 list_add(&pi_state->list, &curr->pi_state_list);
1356 spin_unlock_irq(&curr->pi_lock);
1357
1358 /* Unqueue and drop the lock */
1359 unqueue_me_pi(q);
1360 if (fshared)
1361 up_read(fshared);
1362 /*
1363 * We own it, so we have to replace the pending owner
 1364 * TID. This must be atomic as we have to preserve the
1365 * owner died bit here.
1366 */
1367 ret = get_user(uval, uaddr);
1368 while (!ret) {
1369 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1370 newval |= (uval & FUTEX_WAITER_REQUEUED);
1371 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1372 uval, newval);
1373 if (curval == -EFAULT)
1374 ret = -EFAULT;
1375 if (curval == uval)
1376 break;
1377 uval = curval;
1378 }
1379 return ret;
1380}
1381
1382/*
1383 * In case we must use restart_block to restart a futex_wait,
1384 * we encode in the 'arg3' shared capability
1385 */
1386#define ARG3_SHARED 1
1387
983static long futex_wait_restart(struct restart_block *restart); 1388static long futex_wait_restart(struct restart_block *restart);
984static int futex_wait_abstime(u32 __user *uaddr, u32 val, 1389static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
985 int timed, unsigned long abs_time) 1390 u32 val, ktime_t *abs_time)
986{ 1391{
987 struct task_struct *curr = current; 1392 struct task_struct *curr = current;
988 DECLARE_WAITQUEUE(wait, curr); 1393 DECLARE_WAITQUEUE(wait, curr);
989 struct futex_hash_bucket *hb; 1394 struct futex_hash_bucket *hb;
990 struct futex_q q; 1395 struct futex_q q;
991 unsigned long time_left = 0;
992 u32 uval; 1396 u32 uval;
993 int ret; 1397 int ret;
1398 struct hrtimer_sleeper t, *to = NULL;
1399 int rem = 0;
994 1400
995 q.pi_state = NULL; 1401 q.pi_state = NULL;
996 retry: 1402 retry:
997 down_read(&curr->mm->mmap_sem); 1403 if (fshared)
1404 down_read(fshared);
998 1405
999 ret = get_futex_key(uaddr, &q.key); 1406 ret = get_futex_key(uaddr, fshared, &q.key);
1000 if (unlikely(ret != 0)) 1407 if (unlikely(ret != 0))
1001 goto out_release_sem; 1408 goto out_release_sem;
1002 1409
@@ -1019,8 +1426,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1019 * a wakeup when *uaddr != val on entry to the syscall. This is 1426 * a wakeup when *uaddr != val on entry to the syscall. This is
1020 * rare, but normal. 1427 * rare, but normal.
1021 * 1428 *
1022 * We hold the mmap semaphore, so the mapping cannot have changed 1429 * for shared futexes, we hold the mmap semaphore, so the mapping
1023 * since we looked it up in get_futex_key. 1430 * cannot have changed since we looked it up in get_futex_key.
1024 */ 1431 */
1025 ret = get_futex_value_locked(&uval, uaddr); 1432 ret = get_futex_value_locked(&uval, uaddr);
1026 1433
@@ -1031,7 +1438,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1031 * If we would have faulted, release mmap_sem, fault it in and 1438 * If we would have faulted, release mmap_sem, fault it in and
1032 * start all over again. 1439 * start all over again.
1033 */ 1440 */
1034 up_read(&curr->mm->mmap_sem); 1441 if (fshared)
1442 up_read(fshared);
1035 1443
1036 ret = get_user(uval, uaddr); 1444 ret = get_user(uval, uaddr);
1037 1445
@@ -1043,6 +1451,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1043 if (uval != val) 1451 if (uval != val)
1044 goto out_unlock_release_sem; 1452 goto out_unlock_release_sem;
1045 1453
1454 /*
1455 * This rt_mutex_waiter structure is prepared here and will
1456 * be used only if this task is requeued from a normal futex to
1457 * a PI-futex with futex_requeue_pi.
1458 */
1459 debug_rt_mutex_init_waiter(&q.waiter);
1460 q.waiter.task = NULL;
1461
1046 /* Only actually queue if *uaddr contained val. */ 1462 /* Only actually queue if *uaddr contained val. */
1047 __queue_me(&q, hb); 1463 __queue_me(&q, hb);
1048 1464
@@ -1050,7 +1466,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1050 * Now the futex is queued and we have checked the data, we 1466 * Now the futex is queued and we have checked the data, we
1051 * don't want to hold mmap_sem while we sleep. 1467 * don't want to hold mmap_sem while we sleep.
1052 */ 1468 */
1053 up_read(&curr->mm->mmap_sem); 1469 if (fshared)
1470 up_read(fshared);
1054 1471
1055 /* 1472 /*
1056 * There might have been scheduling since the queue_me(), as we 1473 * There might have been scheduling since the queue_me(), as we
@@ -1065,23 +1482,33 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1065 __set_current_state(TASK_INTERRUPTIBLE); 1482 __set_current_state(TASK_INTERRUPTIBLE);
1066 add_wait_queue(&q.waiters, &wait); 1483 add_wait_queue(&q.waiters, &wait);
1067 /* 1484 /*
1068 * !list_empty() is safe here without any lock. 1485 * !plist_node_empty() is safe here without any lock.
1069 * q.lock_ptr != 0 is not safe, because of ordering against wakeup. 1486 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
1070 */ 1487 */
1071 time_left = 0; 1488 if (likely(!plist_node_empty(&q.list))) {
1072 if (likely(!list_empty(&q.list))) { 1489 if (!abs_time)
1073 unsigned long rel_time; 1490 schedule();
1074 1491 else {
1075 if (timed) { 1492 to = &t;
1076 unsigned long now = jiffies; 1493 hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1077 if (time_after(now, abs_time)) 1494 hrtimer_init_sleeper(&t, current);
1078 rel_time = 0; 1495 t.timer.expires = *abs_time;
1079 else
1080 rel_time = abs_time - now;
1081 } else
1082 rel_time = MAX_SCHEDULE_TIMEOUT;
1083 1496
1084 time_left = schedule_timeout(rel_time); 1497 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
1498
1499 /*
1500 * the timer could have already expired, in which
1501 * case current would be flagged for rescheduling.
1502 * Don't bother calling schedule.
1503 */
1504 if (likely(t.task))
1505 schedule();
1506
1507 hrtimer_cancel(&t.timer);
1508
 1509 /* Flag if a timeout occurred */
1510 rem = (t.task == NULL);
1511 }
1085 } 1512 }
1086 __set_current_state(TASK_RUNNING); 1513 __set_current_state(TASK_RUNNING);
1087 1514
@@ -1090,17 +1517,80 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1090 * we are the only user of it. 1517 * we are the only user of it.
1091 */ 1518 */
1092 1519
1520 if (q.pi_state) {
1521 /*
1522 * We were woken but have been requeued on a PI-futex.
1523 * We have to complete the lock acquisition by taking
1524 * the rtmutex.
1525 */
1526
1527 struct rt_mutex *lock = &q.pi_state->pi_mutex;
1528
1529 spin_lock(&lock->wait_lock);
1530 if (unlikely(q.waiter.task)) {
1531 remove_waiter(lock, &q.waiter);
1532 }
1533 spin_unlock(&lock->wait_lock);
1534
1535 if (rem)
1536 ret = -ETIMEDOUT;
1537 else
1538 ret = rt_mutex_timed_lock(lock, to, 1);
1539
1540 if (fshared)
1541 down_read(fshared);
1542 spin_lock(q.lock_ptr);
1543
1544 /*
1545 * Got the lock. We might not be the anticipated owner if we
1546 * did a lock-steal - fix up the PI-state in that case.
1547 */
1548 if (!ret && q.pi_state->owner != curr) {
1549 /*
1550 * We MUST play with the futex we were requeued on,
1551 * NOT the current futex.
1552 * We can retrieve it from the key of the pi_state
1553 */
1554 uaddr = q.pi_state->key.uaddr;
1555
1556 /* mmap_sem and hash_bucket lock are unlocked at
1557 return of this function */
1558 ret = fixup_pi_state_owner(uaddr, fshared,
1559 &q, hb, curr);
1560 } else {
1561 /*
1562 * Catch the rare case, where the lock was released
1563 * when we were on the way back before we locked
1564 * the hash bucket.
1565 */
1566 if (ret && q.pi_state->owner == curr) {
1567 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1568 ret = 0;
1569 }
1570 /* Unqueue and drop the lock */
1571 unqueue_me_pi(&q);
1572 if (fshared)
1573 up_read(fshared);
1574 }
1575
1576 debug_rt_mutex_free_waiter(&q.waiter);
1577
1578 return ret;
1579 }
1580
1581 debug_rt_mutex_free_waiter(&q.waiter);
1582
1093 /* If we were woken (and unqueued), we succeeded, whatever. */ 1583 /* If we were woken (and unqueued), we succeeded, whatever. */
1094 if (!unqueue_me(&q)) 1584 if (!unqueue_me(&q))
1095 return 0; 1585 return 0;
1096 if (time_left == 0) 1586 if (rem)
1097 return -ETIMEDOUT; 1587 return -ETIMEDOUT;
1098 1588
1099 /* 1589 /*
1100 * We expect signal_pending(current), but another thread may 1590 * We expect signal_pending(current), but another thread may
1101 * have handled it for us already. 1591 * have handled it for us already.
1102 */ 1592 */
1103 if (time_left == MAX_SCHEDULE_TIMEOUT) 1593 if (!abs_time)
1104 return -ERESTARTSYS; 1594 return -ERESTARTSYS;
1105 else { 1595 else {
1106 struct restart_block *restart; 1596 struct restart_block *restart;
@@ -1108,8 +1598,10 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1108 restart->fn = futex_wait_restart; 1598 restart->fn = futex_wait_restart;
1109 restart->arg0 = (unsigned long)uaddr; 1599 restart->arg0 = (unsigned long)uaddr;
1110 restart->arg1 = (unsigned long)val; 1600 restart->arg1 = (unsigned long)val;
1111 restart->arg2 = (unsigned long)timed; 1601 restart->arg2 = (unsigned long)abs_time;
1112 restart->arg3 = abs_time; 1602 restart->arg3 = 0;
1603 if (fshared)
1604 restart->arg3 |= ARG3_SHARED;
1113 return -ERESTART_RESTARTBLOCK; 1605 return -ERESTART_RESTARTBLOCK;
1114 } 1606 }
1115 1607
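The -ERESTART_RESTARTBLOCK path above parks the syscall arguments in current's restart block so a signal return can transparently restart the wait with the original absolute deadline. A hedged sketch of the general mechanism, with my_wait() and the argument layout invented for illustration:

#include <linux/thread_info.h>
#include <linux/errno.h>
#include <linux/sched.h>

static long my_wait(void __user *uaddr, u32 val);	/* hypothetical */

/* Invoked from the signal return path instead of re-entering the syscall. */
static long my_wait_restart(struct restart_block *restart)
{
	void __user *uaddr = (void __user *)restart->arg0;
	u32 val = (u32)restart->arg1;

	/* A second interruption should not restart again automatically. */
	restart->fn = do_no_restart_syscall;
	return my_wait(uaddr, val);
}

static long my_wait_interrupted(void __user *uaddr, u32 val)
{
	struct restart_block *restart = &current_thread_info()->restart_block;

	restart->fn = my_wait_restart;
	restart->arg0 = (unsigned long)uaddr;
	restart->arg1 = (unsigned long)val;
	return -ERESTART_RESTARTBLOCK;
}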
@@ -1117,65 +1609,111 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
1117 queue_unlock(&q, hb); 1609 queue_unlock(&q, hb);
1118 1610
1119 out_release_sem: 1611 out_release_sem:
1120 up_read(&curr->mm->mmap_sem); 1612 if (fshared)
1613 up_read(fshared);
1121 return ret; 1614 return ret;
1122} 1615}
1123 1616
1124static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
1125{
1126 int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
1127 return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
1128}
1129 1617
1130static long futex_wait_restart(struct restart_block *restart) 1618static long futex_wait_restart(struct restart_block *restart)
1131{ 1619{
1132 u32 __user *uaddr = (u32 __user *)restart->arg0; 1620 u32 __user *uaddr = (u32 __user *)restart->arg0;
1133 u32 val = (u32)restart->arg1; 1621 u32 val = (u32)restart->arg1;
1134 int timed = (int)restart->arg2; 1622 ktime_t *abs_time = (ktime_t *)restart->arg2;
1135 unsigned long abs_time = restart->arg3; 1623 struct rw_semaphore *fshared = NULL;
1136 1624
1137 restart->fn = do_no_restart_syscall; 1625 restart->fn = do_no_restart_syscall;
1138 return (long)futex_wait_abstime(uaddr, val, timed, abs_time); 1626 if (restart->arg3 & ARG3_SHARED)
1627 fshared = &current->mm->mmap_sem;
1628 return (long)futex_wait(uaddr, fshared, val, abs_time);
1139} 1629}
1140 1630
1141 1631
1632static void set_pi_futex_owner(struct futex_hash_bucket *hb,
1633 union futex_key *key, struct task_struct *p)
1634{
1635 struct plist_head *head;
1636 struct futex_q *this, *next;
1637 struct futex_pi_state *pi_state = NULL;
1638 struct rt_mutex *lock;
1639
 1640 /* Search for a waiter that should already exist */
1641
1642 head = &hb->chain;
1643
1644 plist_for_each_entry_safe(this, next, head, list) {
1645 if (match_futex (&this->key, key)) {
1646 pi_state = this->pi_state;
1647 break;
1648 }
1649 }
1650
1651 BUG_ON(!pi_state);
1652
1653 /* set p as pi_state's owner */
1654 lock = &pi_state->pi_mutex;
1655
1656 spin_lock(&lock->wait_lock);
1657 spin_lock_irq(&p->pi_lock);
1658
1659 list_add(&pi_state->list, &p->pi_state_list);
1660 pi_state->owner = p;
1661
1662
1663 /* set p as pi_mutex's owner */
1664 debug_rt_mutex_proxy_lock(lock, p);
1665 WARN_ON(rt_mutex_owner(lock));
1666 rt_mutex_set_owner(lock, p, 0);
1667 rt_mutex_deadlock_account_lock(lock, p);
1668
1669 plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
1670 &p->pi_waiters);
1671 __rt_mutex_adjust_prio(p);
1672
1673 spin_unlock_irq(&p->pi_lock);
1674 spin_unlock(&lock->wait_lock);
1675}
1676
1142/* 1677/*
1143 * Userspace tried a 0 -> TID atomic transition of the futex value 1678 * Userspace tried a 0 -> TID atomic transition of the futex value
1144 * and failed. The kernel side here does the whole locking operation: 1679 * and failed. The kernel side here does the whole locking operation:
1145 * if there are waiters then it will block, it does PI, etc. (Due to 1680 * if there are waiters then it will block, it does PI, etc. (Due to
1146 * races the kernel might see a 0 value of the futex too.) 1681 * races the kernel might see a 0 value of the futex too.)
1147 */ 1682 */
1148static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, 1683static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1149 long nsec, int trylock) 1684 int detect, ktime_t *time, int trylock)
1150{ 1685{
1151 struct hrtimer_sleeper timeout, *to = NULL; 1686 struct hrtimer_sleeper timeout, *to = NULL;
1152 struct task_struct *curr = current; 1687 struct task_struct *curr = current;
1153 struct futex_hash_bucket *hb; 1688 struct futex_hash_bucket *hb;
1154 u32 uval, newval, curval; 1689 u32 uval, newval, curval;
1155 struct futex_q q; 1690 struct futex_q q;
1156 int ret, attempt = 0; 1691 int ret, lock_held, attempt = 0;
1157 1692
1158 if (refill_pi_state_cache()) 1693 if (refill_pi_state_cache())
1159 return -ENOMEM; 1694 return -ENOMEM;
1160 1695
1161 if (sec != MAX_SCHEDULE_TIMEOUT) { 1696 if (time) {
1162 to = &timeout; 1697 to = &timeout;
1163 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 1698 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
1164 hrtimer_init_sleeper(to, current); 1699 hrtimer_init_sleeper(to, current);
1165 to->timer.expires = ktime_set(sec, nsec); 1700 to->timer.expires = *time;
1166 } 1701 }
1167 1702
1168 q.pi_state = NULL; 1703 q.pi_state = NULL;
1169 retry: 1704 retry:
1170 down_read(&curr->mm->mmap_sem); 1705 if (fshared)
1706 down_read(fshared);
1171 1707
1172 ret = get_futex_key(uaddr, &q.key); 1708 ret = get_futex_key(uaddr, fshared, &q.key);
1173 if (unlikely(ret != 0)) 1709 if (unlikely(ret != 0))
1174 goto out_release_sem; 1710 goto out_release_sem;
1175 1711
1176 hb = queue_lock(&q, -1, NULL); 1712 hb = queue_lock(&q, -1, NULL);
1177 1713
1178 retry_locked: 1714 retry_locked:
1715 lock_held = 0;
1716
1179 /* 1717 /*
1180 * To avoid races, we attempt to take the lock here again 1718 * To avoid races, we attempt to take the lock here again
1181 * (by doing a 0 -> TID atomic cmpxchg), while holding all 1719 * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1194,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1194 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { 1732 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1195 if (!detect && 0) 1733 if (!detect && 0)
1196 force_sig(SIGKILL, current); 1734 force_sig(SIGKILL, current);
1197 ret = -EDEADLK; 1735 /*
1736 * Normally, this check is done in user space.
1737 * In case of requeue, the owner may attempt to lock this futex,
1738 * even if the ownership has already been given by the previous
1739 * waker.
 1740 * Normally this would be a deadlock, but not in the
 1741 * REQUEUE_PI case.
1742 */
1743 if (!(curval & FUTEX_WAITER_REQUEUED))
1744 ret = -EDEADLK;
1198 goto out_unlock_release_sem; 1745 goto out_unlock_release_sem;
1199 } 1746 }
1200 1747
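The extra FUTEX_WAITER_REQUEUED test above relaxes the usual "futex word already holds my TID, so this is a self-deadlock" rule: after a PI requeue the waker may already have installed this task's TID on the target futex, so only a TID match without the requeued flag is reported as -EDEADLK. Restated as a small sketch; is_real_deadlock() is a made-up helper and the flag names come from the hunk above:

#include <linux/futex.h>

static int is_real_deadlock(u32 curval, pid_t my_pid)
{
	if ((curval & FUTEX_TID_MASK) != my_pid)
		return 0;	/* owned by someone else - no self-deadlock here */

	/* A requeued waiter may legitimately find its own TID installed. */
	return !(curval & FUTEX_WAITER_REQUEUED);
}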
@@ -1206,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1206 goto out_unlock_release_sem; 1753 goto out_unlock_release_sem;
1207 1754
1208 uval = curval; 1755 uval = curval;
1209 newval = uval | FUTEX_WAITERS; 1756 /*
1757 * In case of a requeue, check if there already is an owner
1758 * If not, just take the futex.
1759 */
1760 if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
1761 /* set current as futex owner */
1762 newval = curval | current->pid;
1763 lock_held = 1;
1764 } else
1765 /* Set the WAITERS flag, so the owner will know it has someone
1766 to wake at next unlock */
1767 newval = curval | FUTEX_WAITERS;
1210 1768
1211 pagefault_disable(); 1769 pagefault_disable();
1212 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 1770 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1217,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1217 if (unlikely(curval != uval)) 1775 if (unlikely(curval != uval))
1218 goto retry_locked; 1776 goto retry_locked;
1219 1777
1778 if (lock_held) {
1779 set_pi_futex_owner(hb, &q.key, curr);
1780 goto out_unlock_release_sem;
1781 }
1782
1220 /* 1783 /*
1221 * We dont have the lock. Look up the PI state (or create it if 1784 * We dont have the lock. Look up the PI state (or create it if
1222 * we are the first waiter): 1785 * we are the first waiter):
1223 */ 1786 */
1224 ret = lookup_pi_state(uval, hb, &q); 1787 ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
1225 1788
1226 if (unlikely(ret)) { 1789 if (unlikely(ret)) {
1227 /* 1790 /*
@@ -1263,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1263 * Now the futex is queued and we have checked the data, we 1826 * Now the futex is queued and we have checked the data, we
1264 * don't want to hold mmap_sem while we sleep. 1827 * don't want to hold mmap_sem while we sleep.
1265 */ 1828 */
1266 up_read(&curr->mm->mmap_sem); 1829 if (fshared)
1830 up_read(fshared);
1267 1831
1268 WARN_ON(!q.pi_state); 1832 WARN_ON(!q.pi_state);
1269 /* 1833 /*
@@ -1277,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1277 ret = ret ? 0 : -EWOULDBLOCK; 1841 ret = ret ? 0 : -EWOULDBLOCK;
1278 } 1842 }
1279 1843
1280 down_read(&curr->mm->mmap_sem); 1844 if (fshared)
1845 down_read(fshared);
1281 spin_lock(q.lock_ptr); 1846 spin_lock(q.lock_ptr);
1282 1847
1283 /* 1848 /*
1284 * Got the lock. We might not be the anticipated owner if we 1849 * Got the lock. We might not be the anticipated owner if we
1285 * did a lock-steal - fix up the PI-state in that case. 1850 * did a lock-steal - fix up the PI-state in that case.
1286 */ 1851 */
1287 if (!ret && q.pi_state->owner != curr) { 1852 if (!ret && q.pi_state->owner != curr)
1288 u32 newtid = current->pid | FUTEX_WAITERS; 1853 /* mmap_sem is unlocked at return of this function */
1289 1854 ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
1290 /* Owner died? */ 1855 else {
1291 if (q.pi_state->owner != NULL) {
1292 spin_lock_irq(&q.pi_state->owner->pi_lock);
1293 WARN_ON(list_empty(&q.pi_state->list));
1294 list_del_init(&q.pi_state->list);
1295 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1296 } else
1297 newtid |= FUTEX_OWNER_DIED;
1298
1299 q.pi_state->owner = current;
1300
1301 spin_lock_irq(&current->pi_lock);
1302 WARN_ON(!list_empty(&q.pi_state->list));
1303 list_add(&q.pi_state->list, &current->pi_state_list);
1304 spin_unlock_irq(&current->pi_lock);
1305
1306 /* Unqueue and drop the lock */
1307 unqueue_me_pi(&q, hb);
1308 up_read(&curr->mm->mmap_sem);
1309 /*
1310 * We own it, so we have to replace the pending owner
1311 * TID. This must be atomic as we have preserve the
1312 * owner died bit here.
1313 */
1314 ret = get_user(uval, uaddr);
1315 while (!ret) {
1316 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1317 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1318 uval, newval);
1319 if (curval == -EFAULT)
1320 ret = -EFAULT;
1321 if (curval == uval)
1322 break;
1323 uval = curval;
1324 }
1325 } else {
1326 /* 1856 /*
1327 * Catch the rare case, where the lock was released 1857 * Catch the rare case, where the lock was released
1328 * when we were on the way back before we locked 1858 * when we were on the way back before we locked
@@ -1333,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1333 ret = 0; 1863 ret = 0;
1334 } 1864 }
1335 /* Unqueue and drop the lock */ 1865 /* Unqueue and drop the lock */
1336 unqueue_me_pi(&q, hb); 1866 unqueue_me_pi(&q);
1337 up_read(&curr->mm->mmap_sem); 1867 if (fshared)
1868 up_read(fshared);
1338 } 1869 }
1339 1870
1340 if (!detect && ret == -EDEADLK && 0) 1871 if (!detect && ret == -EDEADLK && 0)
@@ -1346,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1346 queue_unlock(&q, hb); 1877 queue_unlock(&q, hb);
1347 1878
1348 out_release_sem: 1879 out_release_sem:
1349 up_read(&curr->mm->mmap_sem); 1880 if (fshared)
1881 up_read(fshared);
1350 return ret; 1882 return ret;
1351 1883
1352 uaddr_faulted: 1884 uaddr_faulted:
@@ -1357,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1357 * still holding the mmap_sem. 1889 * still holding the mmap_sem.
1358 */ 1890 */
1359 if (attempt++) { 1891 if (attempt++) {
1360 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 1892 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1361 ret = -EFAULT; 1893 attempt);
1894 if (ret)
1362 goto out_unlock_release_sem; 1895 goto out_unlock_release_sem;
1363 }
1364 goto retry_locked; 1896 goto retry_locked;
1365 } 1897 }
1366 1898
1367 queue_unlock(&q, hb); 1899 queue_unlock(&q, hb);
1368 up_read(&curr->mm->mmap_sem); 1900 if (fshared)
1901 up_read(fshared);
1369 1902
1370 ret = get_user(uval, uaddr); 1903 ret = get_user(uval, uaddr);
1371 if (!ret && (uval != -EFAULT)) 1904 if (!ret && (uval != -EFAULT))
@@ -1379,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1379 * This is the in-kernel slowpath: we look up the PI state (if any), 1912 * This is the in-kernel slowpath: we look up the PI state (if any),
1380 * and do the rt-mutex unlock. 1913 * and do the rt-mutex unlock.
1381 */ 1914 */
1382static int futex_unlock_pi(u32 __user *uaddr) 1915static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
1383{ 1916{
1384 struct futex_hash_bucket *hb; 1917 struct futex_hash_bucket *hb;
1385 struct futex_q *this, *next; 1918 struct futex_q *this, *next;
1386 u32 uval; 1919 u32 uval;
1387 struct list_head *head; 1920 struct plist_head *head;
1388 union futex_key key; 1921 union futex_key key;
1389 int ret, attempt = 0; 1922 int ret, attempt = 0;
1390 1923
@@ -1399,9 +1932,10 @@ retry:
1399 /* 1932 /*
1400 * First take all the futex related locks: 1933 * First take all the futex related locks:
1401 */ 1934 */
1402 down_read(&current->mm->mmap_sem); 1935 if (fshared)
1936 down_read(fshared);
1403 1937
1404 ret = get_futex_key(uaddr, &key); 1938 ret = get_futex_key(uaddr, fshared, &key);
1405 if (unlikely(ret != 0)) 1939 if (unlikely(ret != 0))
1406 goto out; 1940 goto out;
1407 1941
@@ -1435,7 +1969,7 @@ retry_locked:
1435 */ 1969 */
1436 head = &hb->chain; 1970 head = &hb->chain;
1437 1971
1438 list_for_each_entry_safe(this, next, head, list) { 1972 plist_for_each_entry_safe(this, next, head, list) {
1439 if (!match_futex (&this->key, &key)) 1973 if (!match_futex (&this->key, &key))
1440 continue; 1974 continue;
1441 ret = wake_futex_pi(uaddr, uval, this); 1975 ret = wake_futex_pi(uaddr, uval, this);
@@ -1460,7 +1994,8 @@ retry_locked:
1460out_unlock: 1994out_unlock:
1461 spin_unlock(&hb->lock); 1995 spin_unlock(&hb->lock);
1462out: 1996out:
1463 up_read(&current->mm->mmap_sem); 1997 if (fshared)
1998 up_read(fshared);
1464 1999
1465 return ret; 2000 return ret;
1466 2001
@@ -1472,15 +2007,16 @@ pi_faulted:
1472 * still holding the mmap_sem. 2007 * still holding the mmap_sem.
1473 */ 2008 */
1474 if (attempt++) { 2009 if (attempt++) {
1475 if (futex_handle_fault((unsigned long)uaddr, attempt)) { 2010 ret = futex_handle_fault((unsigned long)uaddr, fshared,
1476 ret = -EFAULT; 2011 attempt);
2012 if (ret)
1477 goto out_unlock; 2013 goto out_unlock;
1478 }
1479 goto retry_locked; 2014 goto retry_locked;
1480 } 2015 }
1481 2016
1482 spin_unlock(&hb->lock); 2017 spin_unlock(&hb->lock);
1483 up_read(&current->mm->mmap_sem); 2018 if (fshared)
2019 up_read(fshared);
1484 2020
1485 ret = get_user(uval, uaddr); 2021 ret = get_user(uval, uaddr);
1486 if (!ret && (uval != -EFAULT)) 2022 if (!ret && (uval != -EFAULT))
@@ -1509,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp,
1509 poll_wait(filp, &q->waiters, wait); 2045 poll_wait(filp, &q->waiters, wait);
1510 2046
1511 /* 2047 /*
1512 * list_empty() is safe here without any lock. 2048 * plist_node_empty() is safe here without any lock.
1513 * q->lock_ptr != 0 is not safe, because of ordering against wakeup. 2049 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
1514 */ 2050 */
1515 if (list_empty(&q->list)) 2051 if (plist_node_empty(&q->list))
1516 ret = POLLIN | POLLRDNORM; 2052 ret = POLLIN | POLLRDNORM;
1517 2053
1518 return ret; 2054 return ret;
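futex_poll() is one of several places converted from list_head to plist, so waiters stay sorted by priority and plist_node_empty()/plist_for_each_entry_safe() replace their list counterparts. A small, hedged sketch of the plist API as it is used in this patch; struct waiter and the helpers are made up:

#include <linux/plist.h>
#include <linux/spinlock.h>

struct waiter {
	struct plist_node list;		/* node's prio decides its position */
	int id;
};

static DEFINE_SPINLOCK(chain_lock);
static struct plist_head chain;

static void chain_init(void)
{
	/* The lock is used by the plist debug checks, as in futex init. */
	plist_head_init(&chain, &chain_lock);
}

static void add_waiter(struct waiter *w, int prio)
{
	plist_node_init(&w->list, prio);
	spin_lock(&chain_lock);
	plist_add(&w->list, &chain);	/* kept sorted, smallest prio first */
	spin_unlock(&chain_lock);
}

static struct waiter *pop_highest_prio(void)
{
	struct waiter *this, *next, *found = NULL;

	spin_lock(&chain_lock);
	plist_for_each_entry_safe(this, next, &chain, list) {
		found = this;		/* head entry has the smallest prio value */
		plist_del(&this->list, &chain);
		break;
	}
	spin_unlock(&chain_lock);
	return found;
}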
@@ -1532,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1532 struct futex_q *q; 2068 struct futex_q *q;
1533 struct file *filp; 2069 struct file *filp;
1534 int ret, err; 2070 int ret, err;
2071 struct rw_semaphore *fshared;
1535 static unsigned long printk_interval; 2072 static unsigned long printk_interval;
1536 2073
1537 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 2074 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -1573,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
1573 } 2110 }
1574 q->pi_state = NULL; 2111 q->pi_state = NULL;
1575 2112
1576 down_read(&current->mm->mmap_sem); 2113 fshared = &current->mm->mmap_sem;
1577 err = get_futex_key(uaddr, &q->key); 2114 down_read(fshared);
2115 err = get_futex_key(uaddr, fshared, &q->key);
1578 2116
1579 if (unlikely(err != 0)) { 2117 if (unlikely(err != 0)) {
1580 up_read(&current->mm->mmap_sem); 2118 up_read(fshared);
1581 kfree(q); 2119 kfree(q);
1582 goto error; 2120 goto error;
1583 } 2121 }
@@ -1589,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1589 filp->private_data = q; 2127 filp->private_data = q;
1590 2128
1591 queue_me(q, ret, filp); 2129 queue_me(q, ret, filp);
1592 up_read(&current->mm->mmap_sem); 2130 up_read(fshared);
1593 2131
1594 /* Now we map fd to filp, so userspace can access it */ 2132 /* Now we map fd to filp, so userspace can access it */
1595 fd_install(ret, filp); 2133 fd_install(ret, filp);
@@ -1702,6 +2240,8 @@ retry:
1702 * userspace. 2240 * userspace.
1703 */ 2241 */
1704 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2242 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2243 /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
2244 mval |= (uval & FUTEX_WAITER_REQUEUED);
1705 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2245 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1706 2246
1707 if (nval == -EFAULT) 2247 if (nval == -EFAULT)
@@ -1716,7 +2256,7 @@ retry:
1716 */ 2256 */
1717 if (!pi) { 2257 if (!pi) {
1718 if (uval & FUTEX_WAITERS) 2258 if (uval & FUTEX_WAITERS)
1719 futex_wake(uaddr, 1); 2259 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
1720 } 2260 }
1721 } 2261 }
1722 return 0; 2262 return 0;
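The two added lines in handle_futex_death() preserve FUTEX_WAITER_REQUEUED alongside FUTEX_WAITERS when the dead owner's TID is replaced by FUTEX_OWNER_DIED. The update has to be a cmpxchg loop because userspace can race with it; a hedged restatement of just that loop, where mark_owner_died() is a made-up wrapper and futex_atomic_cmpxchg_inatomic() is the arch helper used throughout this file:

/* uval is the futex word already read by the caller. */
static int mark_owner_died(u32 __user *uaddr, u32 uval)
{
	u32 mval, nval;

	for (;;) {
		/* Drop the TID, keep the waiter bits, flag the dead owner. */
		mval = (uval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) |
			FUTEX_OWNER_DIED;
		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
		if (nval == -EFAULT)
			return -EFAULT;
		if (nval == uval)
			return 0;	/* mval is now in place */
		uval = nval;		/* lost a race with userspace - retry */
	}
}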
@@ -1772,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
1772 return; 2312 return;
1773 2313
1774 if (pending) 2314 if (pending)
1775 handle_futex_death((void __user *)pending + futex_offset, curr, pip); 2315 handle_futex_death((void __user *)pending + futex_offset,
2316 curr, pip);
1776 2317
1777 while (entry != &head->list) { 2318 while (entry != &head->list) {
1778 /* 2319 /*
@@ -1798,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr)
1798 } 2339 }
1799} 2340}
1800 2341
1801long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, 2342long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1802 u32 __user *uaddr2, u32 val2, u32 val3) 2343 u32 __user *uaddr2, u32 val2, u32 val3)
1803{ 2344{
1804 int ret; 2345 int ret;
2346 int cmd = op & FUTEX_CMD_MASK;
2347 struct rw_semaphore *fshared = NULL;
2348
2349 if (!(op & FUTEX_PRIVATE_FLAG))
2350 fshared = &current->mm->mmap_sem;
1805 2351
1806 switch (op) { 2352 switch (cmd) {
1807 case FUTEX_WAIT: 2353 case FUTEX_WAIT:
1808 ret = futex_wait(uaddr, val, timeout); 2354 ret = futex_wait(uaddr, fshared, val, timeout);
1809 break; 2355 break;
1810 case FUTEX_WAKE: 2356 case FUTEX_WAKE:
1811 ret = futex_wake(uaddr, val); 2357 ret = futex_wake(uaddr, fshared, val);
1812 break; 2358 break;
1813 case FUTEX_FD: 2359 case FUTEX_FD:
1814 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ 2360 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
1815 ret = futex_fd(uaddr, val); 2361 ret = futex_fd(uaddr, val);
1816 break; 2362 break;
1817 case FUTEX_REQUEUE: 2363 case FUTEX_REQUEUE:
1818 ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); 2364 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
1819 break; 2365 break;
1820 case FUTEX_CMP_REQUEUE: 2366 case FUTEX_CMP_REQUEUE:
1821 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 2367 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
1822 break; 2368 break;
1823 case FUTEX_WAKE_OP: 2369 case FUTEX_WAKE_OP:
1824 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 2370 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
1825 break; 2371 break;
1826 case FUTEX_LOCK_PI: 2372 case FUTEX_LOCK_PI:
1827 ret = futex_lock_pi(uaddr, val, timeout, val2, 0); 2373 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
1828 break; 2374 break;
1829 case FUTEX_UNLOCK_PI: 2375 case FUTEX_UNLOCK_PI:
1830 ret = futex_unlock_pi(uaddr); 2376 ret = futex_unlock_pi(uaddr, fshared);
1831 break; 2377 break;
1832 case FUTEX_TRYLOCK_PI: 2378 case FUTEX_TRYLOCK_PI:
1833 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); 2379 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2380 break;
2381 case FUTEX_CMP_REQUEUE_PI:
2382 ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
1834 break; 2383 break;
1835 default: 2384 default:
1836 ret = -ENOSYS; 2385 ret = -ENOSYS;
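do_futex() now splits the op word into a command (masked with FUTEX_CMD_MASK) and flag bits, and a missing FUTEX_PRIVATE_FLAG selects the shared path by handing the caller's mmap_sem down as fshared. A minimal sketch of that decode step; decode_op() is a made-up helper:

#include <linux/futex.h>
#include <linux/sched.h>

static void decode_op(int op, struct mm_struct *mm,
		      int *cmd, struct rw_semaphore **fshared)
{
	*cmd = op & FUTEX_CMD_MASK;

	/* Private futexes skip the mmap_sem based key lookup entirely. */
	if (op & FUTEX_PRIVATE_FLAG)
		*fshared = NULL;
	else
		*fshared = &mm->mmap_sem;
}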
@@ -1843,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1843 struct timespec __user *utime, u32 __user *uaddr2, 2392 struct timespec __user *utime, u32 __user *uaddr2,
1844 u32 val3) 2393 u32 val3)
1845{ 2394{
1846 struct timespec t; 2395 struct timespec ts;
1847 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 2396 ktime_t t, *tp = NULL;
1848 u32 val2 = 0; 2397 u32 val2 = 0;
2398 int cmd = op & FUTEX_CMD_MASK;
1849 2399
1850 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 2400 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
1851 if (copy_from_user(&t, utime, sizeof(t)) != 0) 2401 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1852 return -EFAULT; 2402 return -EFAULT;
1853 if (!timespec_valid(&t)) 2403 if (!timespec_valid(&ts))
1854 return -EINVAL; 2404 return -EINVAL;
1855 if (op == FUTEX_WAIT) 2405
1856 timeout = timespec_to_jiffies(&t) + 1; 2406 t = timespec_to_ktime(ts);
1857 else { 2407 if (cmd == FUTEX_WAIT)
1858 timeout = t.tv_sec; 2408 t = ktime_add(ktime_get(), t);
1859 val2 = t.tv_nsec; 2409 tp = &t;
1860 }
1861 } 2410 }
1862 /* 2411 /*
1863 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 2412 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
1864 */ 2413 */
1865 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 2414 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
2415 || cmd == FUTEX_CMP_REQUEUE_PI)
1866 val2 = (u32) (unsigned long) utime; 2416 val2 = (u32) (unsigned long) utime;
1867 2417
1868 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 2418 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
1869} 2419}
1870 2420
1871static int futexfs_get_sb(struct file_system_type *fs_type, 2421static int futexfs_get_sb(struct file_system_type *fs_type,
@@ -1895,7 +2445,7 @@ static int __init init(void)
1895 } 2445 }
1896 2446
1897 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2447 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1898 INIT_LIST_HEAD(&futex_queues[i].chain); 2448 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
1899 spin_lock_init(&futex_queues[i].lock); 2449 spin_lock_init(&futex_queues[i].lock);
1900 } 2450 }
1901 return 0; 2451 return 0;
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 50f24eea6cd0..338a9b489fbc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -141,24 +141,24 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
141 struct compat_timespec __user *utime, u32 __user *uaddr2, 141 struct compat_timespec __user *utime, u32 __user *uaddr2,
142 u32 val3) 142 u32 val3)
143{ 143{
144 struct timespec t; 144 struct timespec ts;
145 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 145 ktime_t t, *tp = NULL;
146 int val2 = 0; 146 int val2 = 0;
147 147
148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
149 if (get_compat_timespec(&t, utime)) 149 if (get_compat_timespec(&ts, utime))
150 return -EFAULT; 150 return -EFAULT;
151 if (!timespec_valid(&t)) 151 if (!timespec_valid(&ts))
152 return -EINVAL; 152 return -EINVAL;
153
154 t = timespec_to_ktime(ts);
153 if (op == FUTEX_WAIT) 155 if (op == FUTEX_WAIT)
154 timeout = timespec_to_jiffies(&t) + 1; 156 t = ktime_add(ktime_get(), t);
155 else { 157 tp = &t;
156 timeout = t.tv_sec;
157 val2 = t.tv_nsec;
158 }
159 } 158 }
160 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 159 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
160 || op == FUTEX_CMP_REQUEUE_PI)
161 val2 = (int) (unsigned long) utime; 161 val2 = (int) (unsigned long) utime;
162 162
163 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 163 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
164} 164}
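Both sys_futex() and compat_sys_futex() now convert the user-supplied timespec to a ktime_t once, and for FUTEX_WAIT anchor the relative timeout to the current monotonic time so the wait and restart paths always work with an absolute deadline. The conversion in isolation; futex_deadline() is a made-up name and the caller is assumed to have copied and validated the timespec:

#include <linux/ktime.h>
#include <linux/time.h>
#include <linux/hrtimer.h>

static ktime_t futex_deadline(const struct timespec *ts, int relative)
{
	ktime_t t = timespec_to_ktime(*ts);

	/* FUTEX_WAIT passes a relative timeout; pin it to "now". */
	if (relative)
		t = ktime_add(ktime_get(), t);
	return t;
}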
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c9f4f044a8a8..23c03f43e196 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1411,11 +1411,13 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
1411 switch (action) { 1411 switch (action) {
1412 1412
1413 case CPU_UP_PREPARE: 1413 case CPU_UP_PREPARE:
1414 case CPU_UP_PREPARE_FROZEN:
1414 init_hrtimers_cpu(cpu); 1415 init_hrtimers_cpu(cpu);
1415 break; 1416 break;
1416 1417
1417#ifdef CONFIG_HOTPLUG_CPU 1418#ifdef CONFIG_HOTPLUG_CPU
1418 case CPU_DEAD: 1419 case CPU_DEAD:
1420 case CPU_DEAD_FROZEN:
1419 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu); 1421 clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &cpu);
1420 migrate_hrtimers(cpu); 1422 migrate_hrtimers(cpu);
1421 break; 1423 break;
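This hunk, like the later ones in profile.c, rcupdate.c and relay.c, teaches an existing CPU-hotplug notifier about the new *_FROZEN actions used while tasks are frozen for suspend or hibernation. The usual idiom is to stack the frozen label next to the normal one; my_online_prepare()/my_offline_cleanup() below are placeholders:

#include <linux/cpu.h>
#include <linux/notifier.h>

static void my_online_prepare(long cpu) { /* allocate per-cpu state */ }
static void my_offline_cleanup(long cpu) { /* free per-cpu state */ }

static int __cpuinit my_cpu_notify(struct notifier_block *self,
				   unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:	/* same work during suspend/resume */
		my_online_prepare(cpu);
		break;
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		my_offline_cleanup(cpu);
		break;
	}
	return NOTIFY_OK;
}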
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 32e1ab1477d1..e391cbb1f566 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -22,7 +22,6 @@
22 * handle_bad_irq - handle spurious and unhandled irqs 22 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 23 * @irq: the interrupt number
24 * @desc: description of the interrupt 24 * @desc: description of the interrupt
25 * @regs: pointer to a register structure
26 * 25 *
27 * Handles spurious and unhandled IRQs. It also prints a debug message. 26 * Handles spurious and unhandled IRQs. It also prints a debug message.
28 */ 27 */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 49cc4b9c1a8d..4d32eb077179 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -135,7 +135,6 @@ static int ____call_usermodehelper(void *data)
135 135
136 /* Unblock all signals and set the session keyring. */ 136 /* Unblock all signals and set the session keyring. */
137 new_session = key_get(sub_info->ring); 137 new_session = key_get(sub_info->ring);
138 flush_signals(current);
139 spin_lock_irq(&current->sighand->siglock); 138 spin_lock_irq(&current->sighand->siglock);
140 old_session = __install_session_keyring(current, new_session); 139 old_session = __install_session_keyring(current, new_session);
141 flush_signal_handlers(current, 1); 140 flush_signal_handlers(current, 1);
@@ -186,14 +185,9 @@ static int wait_for_helper(void *data)
186{ 185{
187 struct subprocess_info *sub_info = data; 186 struct subprocess_info *sub_info = data;
188 pid_t pid; 187 pid_t pid;
189 struct k_sigaction sa;
190 188
191 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 189 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't
192 * populate the status, but will return -ECHILD. */ 190 * populate the status, but will return -ECHILD. */
193 sa.sa.sa_handler = SIG_IGN;
194 sa.sa.sa_flags = 0;
195 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
196 do_sigaction(SIGCHLD, &sa, NULL);
197 allow_signal(SIGCHLD); 191 allow_signal(SIGCHLD);
198 192
199 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 193 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
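With the manual k_sigaction setup gone, allow_signal(SIGCHLD) is all a kernel thread needs for sys_wait4() to report its child's exit status instead of -ECHILD. A hedged sketch of that pattern, mirroring the surrounding wait_for_helper() code; run_and_reap() is a made-up name:

#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/syscalls.h>

static int run_and_reap(int (*fn)(void *), void *arg)
{
	pid_t pid;
	int status = 0;

	allow_signal(SIGCHLD);		/* otherwise sys_wait4() gives -ECHILD */

	pid = kernel_thread(fn, arg, SIGCHLD);
	if (pid < 0)
		return pid;

	/* Kernel threads run with a kernel address limit, as in kmod.c. */
	sys_wait4(pid, (int __user *)&status, 0, NULL);
	return status;
}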
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 87c50ccd1d4e..df8a8e8f6ca4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,7 +1,7 @@
1/* Kernel thread helper functions. 1/* Kernel thread helper functions.
2 * Copyright (C) 2004 IBM Corporation, Rusty Russell. 2 * Copyright (C) 2004 IBM Corporation, Rusty Russell.
3 * 3 *
4 * Creation is done via keventd, so that we get a clean environment 4 * Creation is done via kthreadd, so that we get a clean environment
5 * even if we're invoked from userspace (think modprobe, hotplug cpu, 5 * even if we're invoked from userspace (think modprobe, hotplug cpu,
6 * etc.). 6 * etc.).
7 */ 7 */
@@ -15,24 +15,22 @@
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18/* 18static DEFINE_SPINLOCK(kthread_create_lock);
19 * We dont want to execute off keventd since it might 19static LIST_HEAD(kthread_create_list);
20 * hold a semaphore our callers hold too: 20struct task_struct *kthreadd_task;
21 */
22static struct workqueue_struct *helper_wq;
23 21
24struct kthread_create_info 22struct kthread_create_info
25{ 23{
26 /* Information passed to kthread() from keventd. */ 24 /* Information passed to kthread() from kthreadd. */
27 int (*threadfn)(void *data); 25 int (*threadfn)(void *data);
28 void *data; 26 void *data;
29 struct completion started; 27 struct completion started;
30 28
31 /* Result passed back to kthread_create() from keventd. */ 29 /* Result passed back to kthread_create() from kthreadd. */
32 struct task_struct *result; 30 struct task_struct *result;
33 struct completion done; 31 struct completion done;
34 32
35 struct work_struct work; 33 struct list_head list;
36}; 34};
37 35
38struct kthread_stop_info 36struct kthread_stop_info
@@ -60,42 +58,17 @@ int kthread_should_stop(void)
60} 58}
61EXPORT_SYMBOL(kthread_should_stop); 59EXPORT_SYMBOL(kthread_should_stop);
62 60
63static void kthread_exit_files(void)
64{
65 struct fs_struct *fs;
66 struct task_struct *tsk = current;
67
68 exit_fs(tsk); /* current->fs->count--; */
69 fs = init_task.fs;
70 tsk->fs = fs;
71 atomic_inc(&fs->count);
72 exit_files(tsk);
73 current->files = init_task.files;
74 atomic_inc(&tsk->files->count);
75}
76
77static int kthread(void *_create) 61static int kthread(void *_create)
78{ 62{
79 struct kthread_create_info *create = _create; 63 struct kthread_create_info *create = _create;
80 int (*threadfn)(void *data); 64 int (*threadfn)(void *data);
81 void *data; 65 void *data;
82 sigset_t blocked;
83 int ret = -EINTR; 66 int ret = -EINTR;
84 67
85 kthread_exit_files(); 68 /* Copy data: it's on kthread's stack */
86
87 /* Copy data: it's on keventd's stack */
88 threadfn = create->threadfn; 69 threadfn = create->threadfn;
89 data = create->data; 70 data = create->data;
90 71
91 /* Block and flush all signals (in case we're not from keventd). */
92 sigfillset(&blocked);
93 sigprocmask(SIG_BLOCK, &blocked, NULL);
94 flush_signals(current);
95
96 /* By default we can run anywhere, unlike keventd. */
97 set_cpus_allowed(current, CPU_MASK_ALL);
98
99 /* OK, tell user we're spawned, wait for stop or wakeup */ 72 /* OK, tell user we're spawned, wait for stop or wakeup */
100 __set_current_state(TASK_INTERRUPTIBLE); 73 __set_current_state(TASK_INTERRUPTIBLE);
101 complete(&create->started); 74 complete(&create->started);
@@ -112,11 +85,8 @@ static int kthread(void *_create)
112 return 0; 85 return 0;
113} 86}
114 87
115/* We are keventd: create a thread. */ 88static void create_kthread(struct kthread_create_info *create)
116static void keventd_create_kthread(struct work_struct *work)
117{ 89{
118 struct kthread_create_info *create =
119 container_of(work, struct kthread_create_info, work);
120 int pid; 90 int pid;
121 91
122 /* We want our own signal handler (we take no signals by default). */ 92 /* We want our own signal handler (we take no signals by default). */
@@ -162,17 +132,14 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
162 create.data = data; 132 create.data = data;
163 init_completion(&create.started); 133 init_completion(&create.started);
164 init_completion(&create.done); 134 init_completion(&create.done);
165 INIT_WORK(&create.work, keventd_create_kthread); 135
166 136 spin_lock(&kthread_create_lock);
167 /* 137 list_add_tail(&create.list, &kthread_create_list);
168 * The workqueue needs to start up first: 138 wake_up_process(kthreadd_task);
169 */ 139 spin_unlock(&kthread_create_lock);
170 if (!helper_wq) 140
171 create.work.func(&create.work); 141 wait_for_completion(&create.done);
172 else { 142
173 queue_work(helper_wq, &create.work);
174 wait_for_completion(&create.done);
175 }
176 if (!IS_ERR(create.result)) { 143 if (!IS_ERR(create.result)) {
177 va_list args; 144 va_list args;
178 va_start(args, namefmt); 145 va_start(args, namefmt);
@@ -180,7 +147,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
180 namefmt, args); 147 namefmt, args);
181 va_end(args); 148 va_end(args);
182 } 149 }
183
184 return create.result; 150 return create.result;
185} 151}
186EXPORT_SYMBOL(kthread_create); 152EXPORT_SYMBOL(kthread_create);
@@ -245,12 +211,47 @@ int kthread_stop(struct task_struct *k)
245} 211}
246EXPORT_SYMBOL(kthread_stop); 212EXPORT_SYMBOL(kthread_stop);
247 213
248static __init int helper_init(void) 214
215static __init void kthreadd_setup(void)
249{ 216{
250 helper_wq = create_singlethread_workqueue("kthread"); 217 struct task_struct *tsk = current;
251 BUG_ON(!helper_wq);
252 218
253 return 0; 219 set_task_comm(tsk, "kthreadd");
220
221 ignore_signals(tsk);
222
223 set_user_nice(tsk, -5);
224 set_cpus_allowed(tsk, CPU_MASK_ALL);
254} 225}
255 226
256core_initcall(helper_init); 227int kthreadd(void *unused)
228{
229 /* Setup a clean context for our children to inherit. */
230 kthreadd_setup();
231
232 current->flags |= PF_NOFREEZE;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236 if (list_empty(&kthread_create_list))
237 schedule();
238 __set_current_state(TASK_RUNNING);
239
240 spin_lock(&kthread_create_lock);
241 while (!list_empty(&kthread_create_list)) {
242 struct kthread_create_info *create;
243
244 create = list_entry(kthread_create_list.next,
245 struct kthread_create_info, list);
246 list_del_init(&create->list);
247 spin_unlock(&kthread_create_lock);
248
249 create_kthread(create);
250
251 spin_lock(&kthread_create_lock);
252 }
253 spin_unlock(&kthread_create_lock);
254 }
255
256 return 0;
257}
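Creation is now funneled through kthreadd instead of keventd, but consumers keep the same interface. A minimal, hedged usage sketch; names prefixed my_ are invented:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *my_task;

static int my_thread_fn(void *data)
{
	/* Run until someone calls kthread_stop() on this task. */
	while (!kthread_should_stop()) {
		/* ... periodic work ... */
		msleep(100);
	}
	return 0;
}

static int my_start(void)
{
	my_task = kthread_create(my_thread_fn, NULL, "my_thread");
	if (IS_ERR(my_task))
		return PTR_ERR(my_task);
	wake_up_process(my_task);	/* kthread_create() leaves it sleeping */
	return 0;
}

static void my_stop(void)
{
	kthread_stop(my_task);		/* blocks until my_thread_fn() returns */
}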
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e7cbbb82765b..303eab18484b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -133,7 +133,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
133 133
134 debug_mutex_lock_common(lock, &waiter); 134 debug_mutex_lock_common(lock, &waiter);
135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
136 debug_mutex_add_waiter(lock, &waiter, task->thread_info); 136 debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
137 137
138 /* add waiting tasks to the end of the waitqueue (FIFO): */ 138 /* add waiting tasks to the end of the waitqueue (FIFO): */
139 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
@@ -159,7 +159,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
159 */ 159 */
160 if (unlikely(state == TASK_INTERRUPTIBLE && 160 if (unlikely(state == TASK_INTERRUPTIBLE &&
161 signal_pending(task))) { 161 signal_pending(task))) {
162 mutex_remove_waiter(lock, &waiter, task->thread_info); 162 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
163 mutex_release(&lock->dep_map, 1, _RET_IP_); 163 mutex_release(&lock->dep_map, 1, _RET_IP_);
164 spin_unlock_mutex(&lock->wait_lock, flags); 164 spin_unlock_mutex(&lock->wait_lock, flags);
165 165
@@ -175,8 +175,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
175 } 175 }
176 176
177 /* got the lock - rejoice! */ 177 /* got the lock - rejoice! */
178 mutex_remove_waiter(lock, &waiter, task->thread_info); 178 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
179 debug_mutex_set_owner(lock, task->thread_info); 179 debug_mutex_set_owner(lock, task_thread_info(task));
180 180
181 /* set it to 0 if there are no waiters left: */ 181 /* set it to 0 if there are no waiters left: */
182 if (likely(list_empty(&lock->wait_list))) 182 if (likely(list_empty(&lock->wait_list)))
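The mutex hunks are a mechanical switch from the removed task->thread_info pointer to the task_thread_info() accessor, which works whether thread_info is embedded in the task_struct or sits at the bottom of the kernel stack. For illustration; task_ti_flags() is a made-up helper:

#include <linux/sched.h>

static inline unsigned long task_ti_flags(struct task_struct *task)
{
	/* Instead of dereferencing task->thread_info->flags directly. */
	return task_thread_info(task)->flags;
}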
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 06331374d862..b5f0543ed84d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,30 +30,69 @@ char resume_file[256] = CONFIG_PM_STD_PARTITION;
30dev_t swsusp_resume_device; 30dev_t swsusp_resume_device;
31sector_t swsusp_resume_block; 31sector_t swsusp_resume_block;
32 32
33enum {
34 HIBERNATION_INVALID,
35 HIBERNATION_PLATFORM,
36 HIBERNATION_TEST,
37 HIBERNATION_TESTPROC,
38 HIBERNATION_SHUTDOWN,
39 HIBERNATION_REBOOT,
40 /* keep last */
41 __HIBERNATION_AFTER_LAST
42};
43#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
44#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
45
46static int hibernation_mode = HIBERNATION_SHUTDOWN;
47
48struct hibernation_ops *hibernation_ops;
49
50/**
51 * hibernation_set_ops - set the global hibernate operations
52 * @ops: the hibernation operations to use in subsequent hibernation transitions
53 */
54
55void hibernation_set_ops(struct hibernation_ops *ops)
56{
57 if (ops && !(ops->prepare && ops->enter && ops->finish)) {
58 WARN_ON(1);
59 return;
60 }
61 mutex_lock(&pm_mutex);
62 hibernation_ops = ops;
63 if (ops)
64 hibernation_mode = HIBERNATION_PLATFORM;
65 else if (hibernation_mode == HIBERNATION_PLATFORM)
66 hibernation_mode = HIBERNATION_SHUTDOWN;
67
68 mutex_unlock(&pm_mutex);
69}
70
71
33/** 72/**
34 * platform_prepare - prepare the machine for hibernation using the 73 * platform_prepare - prepare the machine for hibernation using the
35 * platform driver if so configured and return an error code if it fails 74 * platform driver if so configured and return an error code if it fails
36 */ 75 */
37 76
38static inline int platform_prepare(void) 77static int platform_prepare(void)
39{ 78{
40 int error = 0; 79 return (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops) ?
80 hibernation_ops->prepare() : 0;
81}
41 82
42 switch (pm_disk_mode) { 83/**
43 case PM_DISK_TEST: 84 * platform_finish - switch the machine to the normal mode of operation
44 case PM_DISK_TESTPROC: 85 * using the platform driver (must be called after platform_prepare())
45 case PM_DISK_SHUTDOWN: 86 */
46 case PM_DISK_REBOOT: 87
47 break; 88static void platform_finish(void)
48 default: 89{
49 if (pm_ops && pm_ops->prepare) 90 if (hibernation_mode == HIBERNATION_PLATFORM && hibernation_ops)
50 error = pm_ops->prepare(PM_SUSPEND_DISK); 91 hibernation_ops->finish();
51 }
52 return error;
53} 92}
54 93
55/** 94/**
56 * power_down - Shut machine down for hibernate. 95 * power_down - Shut the machine down for hibernation.
57 * 96 *
58 * Use the platform driver, if configured so; otherwise try 97 * Use the platform driver, if configured so; otherwise try
59 * to power off or reboot. 98 * to power off or reboot.
@@ -61,20 +100,20 @@ static inline int platform_prepare(void)
61 100
62static void power_down(void) 101static void power_down(void)
63{ 102{
64 switch (pm_disk_mode) { 103 switch (hibernation_mode) {
65 case PM_DISK_TEST: 104 case HIBERNATION_TEST:
66 case PM_DISK_TESTPROC: 105 case HIBERNATION_TESTPROC:
67 break; 106 break;
68 case PM_DISK_SHUTDOWN: 107 case HIBERNATION_SHUTDOWN:
69 kernel_power_off(); 108 kernel_power_off();
70 break; 109 break;
71 case PM_DISK_REBOOT: 110 case HIBERNATION_REBOOT:
72 kernel_restart(NULL); 111 kernel_restart(NULL);
73 break; 112 break;
74 default: 113 case HIBERNATION_PLATFORM:
75 if (pm_ops && pm_ops->enter) { 114 if (hibernation_ops) {
76 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 115 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
77 pm_ops->enter(PM_SUSPEND_DISK); 116 hibernation_ops->enter();
78 break; 117 break;
79 } 118 }
80 } 119 }
@@ -87,20 +126,6 @@ static void power_down(void)
87 while(1); 126 while(1);
88} 127}
89 128
90static inline void platform_finish(void)
91{
92 switch (pm_disk_mode) {
93 case PM_DISK_TEST:
94 case PM_DISK_TESTPROC:
95 case PM_DISK_SHUTDOWN:
96 case PM_DISK_REBOOT:
97 break;
98 default:
99 if (pm_ops && pm_ops->finish)
100 pm_ops->finish(PM_SUSPEND_DISK);
101 }
102}
103
104static void unprepare_processes(void) 129static void unprepare_processes(void)
105{ 130{
106 thaw_processes(); 131 thaw_processes();
@@ -120,13 +145,10 @@ static int prepare_processes(void)
120} 145}
121 146
122/** 147/**
123 * pm_suspend_disk - The granpappy of hibernation power management. 148 * hibernate - The granpappy of the built-in hibernation management
124 *
125 * If not, then call swsusp to do its thing, then figure out how
126 * to power down the system.
127 */ 149 */
128 150
129int pm_suspend_disk(void) 151int hibernate(void)
130{ 152{
131 int error; 153 int error;
132 154
@@ -143,7 +165,8 @@ int pm_suspend_disk(void)
143 if (error) 165 if (error)
144 goto Finish; 166 goto Finish;
145 167
146 if (pm_disk_mode == PM_DISK_TESTPROC) { 168 mutex_lock(&pm_mutex);
169 if (hibernation_mode == HIBERNATION_TESTPROC) {
147 printk("swsusp debug: Waiting for 5 seconds.\n"); 170 printk("swsusp debug: Waiting for 5 seconds.\n");
148 mdelay(5000); 171 mdelay(5000);
149 goto Thaw; 172 goto Thaw;
@@ -168,7 +191,7 @@ int pm_suspend_disk(void)
168 if (error) 191 if (error)
169 goto Enable_cpus; 192 goto Enable_cpus;
170 193
171 if (pm_disk_mode == PM_DISK_TEST) { 194 if (hibernation_mode == HIBERNATION_TEST) {
172 printk("swsusp debug: Waiting for 5 seconds.\n"); 195 printk("swsusp debug: Waiting for 5 seconds.\n");
173 mdelay(5000); 196 mdelay(5000);
174 goto Enable_cpus; 197 goto Enable_cpus;
@@ -205,6 +228,7 @@ int pm_suspend_disk(void)
205 device_resume(); 228 device_resume();
206 resume_console(); 229 resume_console();
207 Thaw: 230 Thaw:
231 mutex_unlock(&pm_mutex);
208 unprepare_processes(); 232 unprepare_processes();
209 Finish: 233 Finish:
210 free_basic_memory_bitmaps(); 234 free_basic_memory_bitmaps();
@@ -220,7 +244,7 @@ int pm_suspend_disk(void)
220 * Called as a late_initcall (so all devices are discovered and 244 * Called as a late_initcall (so all devices are discovered and
221 * initialized), we call swsusp to see if we have a saved image or not. 245 * initialized), we call swsusp to see if we have a saved image or not.
222 * If so, we quiesce devices, then restore the saved image. We will 246 * If so, we quiesce devices, then restore the saved image. We will
223 * return above (in pm_suspend_disk() ) if everything goes well. 247 * return above (in hibernate() ) if everything goes well.
224 * Otherwise, we fail gracefully and return to the normally 248 * Otherwise, we fail gracefully and return to the normally
225 * scheduled program. 249 * scheduled program.
226 * 250 *
@@ -315,25 +339,26 @@ static int software_resume(void)
315late_initcall(software_resume); 339late_initcall(software_resume);
316 340
317 341
318static const char * const pm_disk_modes[] = { 342static const char * const hibernation_modes[] = {
319 [PM_DISK_PLATFORM] = "platform", 343 [HIBERNATION_PLATFORM] = "platform",
320 [PM_DISK_SHUTDOWN] = "shutdown", 344 [HIBERNATION_SHUTDOWN] = "shutdown",
321 [PM_DISK_REBOOT] = "reboot", 345 [HIBERNATION_REBOOT] = "reboot",
322 [PM_DISK_TEST] = "test", 346 [HIBERNATION_TEST] = "test",
323 [PM_DISK_TESTPROC] = "testproc", 347 [HIBERNATION_TESTPROC] = "testproc",
324}; 348};
325 349
326/** 350/**
327 * disk - Control suspend-to-disk mode 351 * disk - Control hibernation mode
328 * 352 *
329 * Suspend-to-disk can be handled in several ways. We have a few options 353 * Suspend-to-disk can be handled in several ways. We have a few options
330 * for putting the system to sleep - using the platform driver (e.g. ACPI 354 * for putting the system to sleep - using the platform driver (e.g. ACPI
331 * or other pm_ops), powering off the system or rebooting the system 355 * or other hibernation_ops), powering off the system or rebooting the
332 * (for testing) as well as the two test modes. 356 * system (for testing) as well as the two test modes.
333 * 357 *
334 * The system can support 'platform', and that is known a priori (and 358 * The system can support 'platform', and that is known a priori (and
335 * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot' 359 * encoded by the presence of hibernation_ops). However, the user may
336 * as alternatives, as well as the test modes 'test' and 'testproc'. 360 * choose 'shutdown' or 'reboot' as alternatives, as well as one of the
361 * test modes, 'test' or 'testproc'.
337 * 362 *
338 * show() will display what the mode is currently set to. 363 * show() will display what the mode is currently set to.
339 * store() will accept one of 364 * store() will accept one of
@@ -345,7 +370,7 @@ static const char * const pm_disk_modes[] = {
345 * 'testproc' 370 * 'testproc'
346 * 371 *
347 * It will only change to 'platform' if the system 372 * It will only change to 'platform' if the system
348 * supports it (as determined from pm_ops->pm_disk_mode). 373 * supports it (as determined by having hibernation_ops).
349 */ 374 */
350 375
351static ssize_t disk_show(struct kset *kset, char *buf) 376static ssize_t disk_show(struct kset *kset, char *buf)
@@ -353,28 +378,25 @@ static ssize_t disk_show(struct kset *kset, char *buf)
353 int i; 378 int i;
354 char *start = buf; 379 char *start = buf;
355 380
356 for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { 381 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
357 if (!pm_disk_modes[i]) 382 if (!hibernation_modes[i])
358 continue; 383 continue;
359 switch (i) { 384 switch (i) {
360 case PM_DISK_SHUTDOWN: 385 case HIBERNATION_SHUTDOWN:
361 case PM_DISK_REBOOT: 386 case HIBERNATION_REBOOT:
362 case PM_DISK_TEST: 387 case HIBERNATION_TEST:
363 case PM_DISK_TESTPROC: 388 case HIBERNATION_TESTPROC:
364 break; 389 break;
365 default: 390 case HIBERNATION_PLATFORM:
366 if (pm_ops && pm_ops->enter && 391 if (hibernation_ops)
367 (i == pm_ops->pm_disk_mode))
368 break; 392 break;
369 /* not a valid mode, continue with loop */ 393 /* not a valid mode, continue with loop */
370 continue; 394 continue;
371 } 395 }
372 if (i == pm_disk_mode) 396 if (i == hibernation_mode)
373 buf += sprintf(buf, "[%s]", pm_disk_modes[i]); 397 buf += sprintf(buf, "[%s] ", hibernation_modes[i]);
374 else 398 else
375 buf += sprintf(buf, "%s", pm_disk_modes[i]); 399 buf += sprintf(buf, "%s ", hibernation_modes[i]);
376 if (i+1 != PM_DISK_MAX)
377 buf += sprintf(buf, " ");
378 } 400 }
379 buf += sprintf(buf, "\n"); 401 buf += sprintf(buf, "\n");
380 return buf-start; 402 return buf-start;
@@ -387,39 +409,38 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
387 int i; 409 int i;
388 int len; 410 int len;
389 char *p; 411 char *p;
390 suspend_disk_method_t mode = 0; 412 int mode = HIBERNATION_INVALID;
391 413
392 p = memchr(buf, '\n', n); 414 p = memchr(buf, '\n', n);
393 len = p ? p - buf : n; 415 len = p ? p - buf : n;
394 416
395 mutex_lock(&pm_mutex); 417 mutex_lock(&pm_mutex);
396 for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) { 418 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
397 if (!strncmp(buf, pm_disk_modes[i], len)) { 419 if (!strncmp(buf, hibernation_modes[i], len)) {
398 mode = i; 420 mode = i;
399 break; 421 break;
400 } 422 }
401 } 423 }
402 if (mode) { 424 if (mode != HIBERNATION_INVALID) {
403 switch (mode) { 425 switch (mode) {
404 case PM_DISK_SHUTDOWN: 426 case HIBERNATION_SHUTDOWN:
405 case PM_DISK_REBOOT: 427 case HIBERNATION_REBOOT:
406 case PM_DISK_TEST: 428 case HIBERNATION_TEST:
407 case PM_DISK_TESTPROC: 429 case HIBERNATION_TESTPROC:
408 pm_disk_mode = mode; 430 hibernation_mode = mode;
409 break; 431 break;
410 default: 432 case HIBERNATION_PLATFORM:
411 if (pm_ops && pm_ops->enter && 433 if (hibernation_ops)
412 (mode == pm_ops->pm_disk_mode)) 434 hibernation_mode = mode;
413 pm_disk_mode = mode;
414 else 435 else
415 error = -EINVAL; 436 error = -EINVAL;
416 } 437 }
417 } else { 438 } else
418 error = -EINVAL; 439 error = -EINVAL;
419 }
420 440
421 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 441 if (!error)
422 pm_disk_modes[mode]); 442 pr_debug("PM: suspend-to-disk mode set to '%s'\n",
443 hibernation_modes[mode]);
423 mutex_unlock(&pm_mutex); 444 mutex_unlock(&pm_mutex);
424 return error ? error : n; 445 return error ? error : n;
425} 446}
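hibernation_set_ops() above refuses a partially filled table (prepare, enter and finish must all be present) and flips the default mode to 'platform'. A hedged sketch of a platform driver registering its hooks; the my_hw_*() callbacks are hypothetical and the header carrying struct hibernation_ops is assumed to be linux/suspend.h:

#include <linux/init.h>
#include <linux/suspend.h>	/* assumed location of struct hibernation_ops */

static int my_hw_prepare(void)
{
	/* Tell the firmware a hibernation image is about to be written. */
	return 0;
}

static int my_hw_enter(void)
{
	/* Enter the platform's S4-like state; does not return on success. */
	return 0;
}

static void my_hw_finish(void)
{
	/* Undo my_hw_prepare() after resume or on error. */
}

static struct hibernation_ops my_hibernation_ops = {
	.prepare	= my_hw_prepare,
	.enter		= my_hw_enter,
	.finish		= my_hw_finish,
};

static int __init my_pm_init(void)
{
	hibernation_set_ops(&my_hibernation_ops);	/* mode becomes "platform" */
	return 0;
}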
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6dda685e7e2..40d56a31245e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,6 @@
30DEFINE_MUTEX(pm_mutex); 30DEFINE_MUTEX(pm_mutex);
31 31
32struct pm_ops *pm_ops; 32struct pm_ops *pm_ops;
33suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
34 33
35/** 34/**
36 * pm_set_ops - Set the global power method table. 35 * pm_set_ops - Set the global power method table.
@@ -41,10 +40,6 @@ void pm_set_ops(struct pm_ops * ops)
41{ 40{
42 mutex_lock(&pm_mutex); 41 mutex_lock(&pm_mutex);
43 pm_ops = ops; 42 pm_ops = ops;
44 if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
45 pm_disk_mode = ops->pm_disk_mode;
46 } else
47 pm_disk_mode = PM_DISK_SHUTDOWN;
48 mutex_unlock(&pm_mutex); 43 mutex_unlock(&pm_mutex);
49} 44}
50 45
@@ -184,24 +179,12 @@ static void suspend_finish(suspend_state_t state)
184static const char * const pm_states[PM_SUSPEND_MAX] = { 179static const char * const pm_states[PM_SUSPEND_MAX] = {
185 [PM_SUSPEND_STANDBY] = "standby", 180 [PM_SUSPEND_STANDBY] = "standby",
186 [PM_SUSPEND_MEM] = "mem", 181 [PM_SUSPEND_MEM] = "mem",
187 [PM_SUSPEND_DISK] = "disk",
188}; 182};
189 183
190static inline int valid_state(suspend_state_t state) 184static inline int valid_state(suspend_state_t state)
191{ 185{
192 /* Suspend-to-disk does not really need low-level support. 186 /* All states need lowlevel support and need to be valid
193 * It can work with shutdown/reboot if needed. If it isn't 187 * to the lowlevel implementation, no valid callback
194 * configured, then it cannot be supported.
195 */
196 if (state == PM_SUSPEND_DISK)
197#ifdef CONFIG_SOFTWARE_SUSPEND
198 return 1;
199#else
200 return 0;
201#endif
202
203 /* all other states need lowlevel support and need to be
204 * valid to the lowlevel implementation, no valid callback
205 * implies that none are valid. */ 188 * implies that none are valid. */
206 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state)) 189 if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
207 return 0; 190 return 0;
@@ -229,11 +212,6 @@ static int enter_state(suspend_state_t state)
229 if (!mutex_trylock(&pm_mutex)) 212 if (!mutex_trylock(&pm_mutex))
230 return -EBUSY; 213 return -EBUSY;
231 214
232 if (state == PM_SUSPEND_DISK) {
233 error = pm_suspend_disk();
234 goto Unlock;
235 }
236
237 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 215 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
238 if ((error = suspend_prepare(state))) 216 if ((error = suspend_prepare(state)))
239 goto Unlock; 217 goto Unlock;
@@ -251,7 +229,7 @@ static int enter_state(suspend_state_t state)
251 229
252/** 230/**
253 * pm_suspend - Externally visible function for suspending system. 231 * pm_suspend - Externally visible function for suspending system.
254 * @state: Enumarted value of state to enter. 232 * @state: Enumerated value of state to enter.
255 * 233 *
256 * Determine whether or not value is within range, get state 234 * Determine whether or not value is within range, get state
257 * structure, and enter (above). 235 * structure, and enter (above).
@@ -289,7 +267,13 @@ static ssize_t state_show(struct kset *kset, char *buf)
289 if (pm_states[i] && valid_state(i)) 267 if (pm_states[i] && valid_state(i))
290 s += sprintf(s,"%s ", pm_states[i]); 268 s += sprintf(s,"%s ", pm_states[i]);
291 } 269 }
292 s += sprintf(s,"\n"); 270#ifdef CONFIG_SOFTWARE_SUSPEND
271 s += sprintf(s, "%s\n", "disk");
272#else
273 if (s != buf)
274 /* convert the last space to a newline */
275 *(s-1) = '\n';
276#endif
293 return (s - buf); 277 return (s - buf);
294} 278}
295 279
@@ -304,6 +288,12 @@ static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
304 p = memchr(buf, '\n', n); 288 p = memchr(buf, '\n', n);
305 len = p ? p - buf : n; 289 len = p ? p - buf : n;
306 290
291 /* First, check if we are requested to hibernate */
292 if (!strncmp(buf, "disk", len)) {
293 error = hibernate();
294 return error ? error : n;
295 }
296
307 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 297 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
308 if (*s && !strncmp(buf, *s, len)) 298 if (*s && !strncmp(buf, *s, len))
309 break; 299 break;
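With 'disk' dropped from pm_states[], state_store() special-cases that string and calls hibernate() directly, so userspace still just writes "disk" to /sys/power/state. A minimal userspace example, with error handling trimmed:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/power/state", O_WRONLY);

	if (fd < 0) {
		perror("open /sys/power/state");
		return 1;
	}
	/* state_store() compares up to the newline, so "disk\n" matches. */
	if (write(fd, "disk\n", 5) != 5)
		perror("write");
	close(fd);
	return 0;
}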
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 34b43542785a..51381487103f 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -25,12 +25,7 @@ struct swsusp_info {
25 */ 25 */
26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) 26#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
27 27
28extern int pm_suspend_disk(void); 28extern struct hibernation_ops *hibernation_ops;
29#else
30static inline int pm_suspend_disk(void)
31{
32 return -EPERM;
33}
34#endif 29#endif
35 30
36extern int pfn_is_nosave(unsigned long); 31extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b7039772b05c..48383ea72290 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1227,7 +1227,7 @@ asmlinkage int swsusp_save(void)
1227 nr_copy_pages = nr_pages; 1227 nr_copy_pages = nr_pages;
1228 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); 1228 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
1229 1229
1230 printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); 1230 printk("swsusp: critical section: done (%d pages copied)\n", nr_pages);
1231 1231
1232 return 0; 1232 return 0;
1233} 1233}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 040560d9c312..24d7d78e6f42 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -130,16 +130,16 @@ static inline int platform_prepare(void)
130{ 130{
131 int error = 0; 131 int error = 0;
132 132
133 if (pm_ops && pm_ops->prepare) 133 if (hibernation_ops)
134 error = pm_ops->prepare(PM_SUSPEND_DISK); 134 error = hibernation_ops->prepare();
135 135
136 return error; 136 return error;
137} 137}
138 138
139static inline void platform_finish(void) 139static inline void platform_finish(void)
140{ 140{
141 if (pm_ops && pm_ops->finish) 141 if (hibernation_ops)
142 pm_ops->finish(PM_SUSPEND_DISK); 142 hibernation_ops->finish();
143} 143}
144 144
145static inline int snapshot_suspend(int platform_suspend) 145static inline int snapshot_suspend(int platform_suspend)
@@ -384,7 +384,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
384 switch (arg) { 384 switch (arg) {
385 385
386 case PMOPS_PREPARE: 386 case PMOPS_PREPARE:
387 if (pm_ops && pm_ops->enter) { 387 if (hibernation_ops) {
388 data->platform_suspend = 1; 388 data->platform_suspend = 1;
389 error = 0; 389 error = 0;
390 } else { 390 } else {
@@ -395,8 +395,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
395 case PMOPS_ENTER: 395 case PMOPS_ENTER:
396 if (data->platform_suspend) { 396 if (data->platform_suspend) {
397 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); 397 kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
398 error = pm_ops->enter(PM_SUSPEND_DISK); 398 error = hibernation_ops->enter();
399 error = 0;
400 } 399 }
401 break; 400 break;
402 401
diff --git a/kernel/profile.c b/kernel/profile.c
index 9bfadb248dd8..cc91b9bf759d 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -340,6 +340,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
340 340
341 switch (action) { 341 switch (action) {
342 case CPU_UP_PREPARE: 342 case CPU_UP_PREPARE:
343 case CPU_UP_PREPARE_FROZEN:
343 node = cpu_to_node(cpu); 344 node = cpu_to_node(cpu);
344 per_cpu(cpu_profile_flip, cpu) = 0; 345 per_cpu(cpu_profile_flip, cpu) = 0;
345 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 346 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
@@ -365,10 +366,13 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
365 __free_page(page); 366 __free_page(page);
366 return NOTIFY_BAD; 367 return NOTIFY_BAD;
367 case CPU_ONLINE: 368 case CPU_ONLINE:
369 case CPU_ONLINE_FROZEN:
368 cpu_set(cpu, prof_cpu_mask); 370 cpu_set(cpu, prof_cpu_mask);
369 break; 371 break;
370 case CPU_UP_CANCELED: 372 case CPU_UP_CANCELED:
373 case CPU_UP_CANCELED_FROZEN:
371 case CPU_DEAD: 374 case CPU_DEAD:
375 case CPU_DEAD_FROZEN:
372 cpu_clear(cpu, prof_cpu_mask); 376 cpu_clear(cpu, prof_cpu_mask);
373 if (per_cpu(cpu_profile_hits, cpu)[0]) { 377 if (per_cpu(cpu_profile_hits, cpu)[0]) {
374 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); 378 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
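
The new CPU_*_FROZEN cases added above simply mirror the non-frozen ones. As a sketch that is not part of this patch, a hotplug callback that wants identical handling for both variants can instead mask off the task-frozen bit before switching, assuming the CPU_TASKS_FROZEN flag this series introduces; the function name is illustrative.

#include <linux/cpu.h>
#include <linux/notifier.h>

static int example_cpu_callback(struct notifier_block *nb,
				unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		/* allocate per-cpu state for cpu (long)hcpu here */
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		/* release per-cpu state for cpu (long)hcpu here */
		break;
	}
	return NOTIFY_OK;
}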
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 3554b76da84c..2c2dd8410dc4 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -558,9 +558,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
558 long cpu = (long)hcpu; 558 long cpu = (long)hcpu;
559 switch (action) { 559 switch (action) {
560 case CPU_UP_PREPARE: 560 case CPU_UP_PREPARE:
561 case CPU_UP_PREPARE_FROZEN:
561 rcu_online_cpu(cpu); 562 rcu_online_cpu(cpu);
562 break; 563 break;
563 case CPU_DEAD: 564 case CPU_DEAD:
565 case CPU_DEAD_FROZEN:
564 rcu_offline_cpu(cpu); 566 rcu_offline_cpu(cpu);
565 break; 567 break;
566 default: 568 default:
diff --git a/kernel/relay.c b/kernel/relay.c
index d24395e8b6e5..4311101b0ca7 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -310,16 +310,13 @@ static struct rchan_callbacks default_channel_callbacks = {
310 310
311/** 311/**
312 * wakeup_readers - wake up readers waiting on a channel 312 * wakeup_readers - wake up readers waiting on a channel
313 * @work: work struct that contains the channel buffer 313 * @data: contains the channel buffer
314 * 314 *
315 * This is the work function used to defer reader waking. The 315 * This is the timer function used to defer reader waking.
316 * reason waking is deferred is that calling directly from write
317 * causes problems if you're writing from say the scheduler.
318 */ 316 */
319static void wakeup_readers(struct work_struct *work) 317static void wakeup_readers(unsigned long data)
320{ 318{
321 struct rchan_buf *buf = 319 struct rchan_buf *buf = (struct rchan_buf *)data;
322 container_of(work, struct rchan_buf, wake_readers.work);
323 wake_up_interruptible(&buf->read_wait); 320 wake_up_interruptible(&buf->read_wait);
324} 321}
325 322
@@ -337,11 +334,9 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init)
337 if (init) { 334 if (init) {
338 init_waitqueue_head(&buf->read_wait); 335 init_waitqueue_head(&buf->read_wait);
339 kref_init(&buf->kref); 336 kref_init(&buf->kref);
340 INIT_DELAYED_WORK(&buf->wake_readers, NULL); 337 setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf);
341 } else { 338 } else
342 cancel_delayed_work(&buf->wake_readers); 339 del_timer_sync(&buf->timer);
343 flush_scheduled_work();
344 }
345 340
346 buf->subbufs_produced = 0; 341 buf->subbufs_produced = 0;
347 buf->subbufs_consumed = 0; 342 buf->subbufs_consumed = 0;
@@ -447,8 +442,7 @@ end:
447static void relay_close_buf(struct rchan_buf *buf) 442static void relay_close_buf(struct rchan_buf *buf)
448{ 443{
449 buf->finalized = 1; 444 buf->finalized = 1;
450 cancel_delayed_work(&buf->wake_readers); 445 del_timer_sync(&buf->timer);
451 flush_scheduled_work();
452 kref_put(&buf->kref, relay_remove_buf); 446 kref_put(&buf->kref, relay_remove_buf);
453} 447}
454 448
@@ -490,6 +484,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
490 484
491 switch(action) { 485 switch(action) {
492 case CPU_UP_PREPARE: 486 case CPU_UP_PREPARE:
487 case CPU_UP_PREPARE_FROZEN:
493 mutex_lock(&relay_channels_mutex); 488 mutex_lock(&relay_channels_mutex);
494 list_for_each_entry(chan, &relay_channels, list) { 489 list_for_each_entry(chan, &relay_channels, list) {
495 if (chan->buf[hotcpu]) 490 if (chan->buf[hotcpu])
@@ -506,6 +501,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
506 mutex_unlock(&relay_channels_mutex); 501 mutex_unlock(&relay_channels_mutex);
507 break; 502 break;
508 case CPU_DEAD: 503 case CPU_DEAD:
504 case CPU_DEAD_FROZEN:
509 /* No need to flush the cpu : will be flushed upon 505 /* No need to flush the cpu : will be flushed upon
510 * final relay_flush() call. */ 506 * final relay_flush() call. */
511 break; 507 break;
@@ -608,11 +604,14 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
608 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 604 buf->dentry->d_inode->i_size += buf->chan->subbuf_size -
609 buf->padding[old_subbuf]; 605 buf->padding[old_subbuf];
610 smp_mb(); 606 smp_mb();
611 if (waitqueue_active(&buf->read_wait)) { 607 if (waitqueue_active(&buf->read_wait))
612 PREPARE_DELAYED_WORK(&buf->wake_readers, 608 /*
613 wakeup_readers); 609 * Calling wake_up_interruptible() from here
614 schedule_delayed_work(&buf->wake_readers, 1); 610 * will deadlock if we happen to be logging
615 } 611 * from the scheduler (trying to re-grab
612 * rq->lock), so defer it.
613 */
614 __mod_timer(&buf->timer, jiffies + 1);
616 } 615 }
617 616
618 old = buf->data; 617 old = buf->data;
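
relay now defers reader wake-ups through a plain kernel timer instead of a delayed work item, so the deferral can be armed safely even when the writer is the scheduler itself. A minimal sketch of the same pattern follows; the names are illustrative and the public mod_timer() stands in for the internal __mod_timer() used above.

#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/wait.h>

struct example_buf {
	wait_queue_head_t read_wait;
	struct timer_list timer;
};

static void example_wakeup(unsigned long data)
{
	struct example_buf *buf = (struct example_buf *)data;

	wake_up_interruptible(&buf->read_wait);
}

static void example_buf_init(struct example_buf *buf)
{
	init_waitqueue_head(&buf->read_wait);
	setup_timer(&buf->timer, example_wakeup, (unsigned long)buf);
}

static void example_buf_poke(struct example_buf *buf)
{
	/* Defer the wake-up by one tick instead of waking directly. */
	if (waitqueue_active(&buf->read_wait))
		mod_timer(&buf->timer, jiffies + 1);
}

static void example_buf_teardown(struct example_buf *buf)
{
	del_timer_sync(&buf->timer);
}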
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978cb2f75..12879f6c1ec3 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
56 * state. 56 * state.
57 */ 57 */
58 58
59static void 59void
60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, 60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
61 unsigned long mask) 61 unsigned long mask)
62{ 62{
@@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
81} 81}
82 82
83/* 83/*
84 * We can speed up the acquire/release, if the architecture
85 * supports cmpxchg and if there's no debugging state to be set up
86 */
87#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
88# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
89static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90{
91 unsigned long owner, *p = (unsigned long *) &lock->owner;
92
93 do {
94 owner = *p;
95 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
96}
97#else
98# define rt_mutex_cmpxchg(l,c,n) (0)
99static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
100{
101 lock->owner = (struct task_struct *)
102 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
103}
104#endif
105
106/*
107 * Calculate task priority from the waiter list priority 84 * Calculate task priority from the waiter list priority
108 * 85 *
109 * Return task->normal_prio when the waiter list is empty or when 86 * Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)
123 * 100 *
124 * This can be both boosting and unboosting. task->pi_lock must be held. 101 * This can be both boosting and unboosting. task->pi_lock must be held.
125 */ 102 */
126static void __rt_mutex_adjust_prio(struct task_struct *task) 103void __rt_mutex_adjust_prio(struct task_struct *task)
127{ 104{
128 int prio = rt_mutex_getprio(task); 105 int prio = rt_mutex_getprio(task);
129 106
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
159 * Decreases task's usage by one - may thus free the task. 136 * Decreases task's usage by one - may thus free the task.
160 * Returns 0 or -EDEADLK. 137 * Returns 0 or -EDEADLK.
161 */ 138 */
162static int rt_mutex_adjust_prio_chain(struct task_struct *task, 139int rt_mutex_adjust_prio_chain(struct task_struct *task,
163 int deadlock_detect, 140 int deadlock_detect,
164 struct rt_mutex *orig_lock, 141 struct rt_mutex *orig_lock,
165 struct rt_mutex_waiter *orig_waiter, 142 struct rt_mutex_waiter *orig_waiter,
166 struct task_struct *top_task) 143 struct task_struct *top_task)
167{ 144{
168 struct rt_mutex *lock; 145 struct rt_mutex *lock;
169 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; 146 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
524 * 501 *
525 * Must be called with lock->wait_lock held 502 * Must be called with lock->wait_lock held
526 */ 503 */
527static void remove_waiter(struct rt_mutex *lock, 504void remove_waiter(struct rt_mutex *lock,
528 struct rt_mutex_waiter *waiter) 505 struct rt_mutex_waiter *waiter)
529{ 506{
530 int first = (waiter == rt_mutex_top_waiter(lock)); 507 int first = (waiter == rt_mutex_top_waiter(lock));
531 struct task_struct *owner = rt_mutex_owner(lock); 508 struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e791e..242ec7ee740b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
113} 113}
114 114
115/* 115/*
116 * We can speed up the acquire/release, if the architecture
117 * supports cmpxchg and if there's no debugging state to be set up
118 */
119#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
120# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
121static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
122{
123 unsigned long owner, *p = (unsigned long *) &lock->owner;
124
125 do {
126 owner = *p;
127 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
128}
129#else
130# define rt_mutex_cmpxchg(l,c,n) (0)
131static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
132{
133 lock->owner = (struct task_struct *)
134 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
135}
136#endif
137
138/*
116 * PI-futex support (proxy locking functions, etc.): 139 * PI-futex support (proxy locking functions, etc.):
117 */ 140 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); 141extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner); 143 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, 144extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner); 145 struct task_struct *proxy_owner);
146
147extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
148 unsigned long mask);
149extern void __rt_mutex_adjust_prio(struct task_struct *task);
150extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
151 int deadlock_detect,
152 struct rt_mutex *orig_lock,
153 struct rt_mutex_waiter *orig_waiter,
154 struct task_struct *top_task);
155extern void remove_waiter(struct rt_mutex *lock,
156 struct rt_mutex_waiter *waiter);
123#endif 157#endif
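
Moving rt_mutex_cmpxchg() and mark_rt_mutex_waiters() into this shared header lets the PI-futex code reuse the same lockless fast path as rtmutex.c. Roughly, and only as a sketch rather than the exact futex code, the uncontended acquire reduces to a single compare-and-exchange on the owner field:

#include <linux/sched.h>	/* for current */

/* Sketch: claim an unowned rt_mutex in one step. On architectures
 * without cmpxchg, rt_mutex_cmpxchg() is defined away to 0, so this
 * always falls back to the wait_lock-protected slow path. */
static inline int example_rt_mutex_fasttrylock(struct rt_mutex *lock)
{
	return rt_mutex_cmpxchg(lock, NULL, current);
}

When the cmpxchg fails, the slow path runs under lock->wait_lock and uses mark_rt_mutex_waiters() to set the low bit of the owner pointer, which in turn forces the eventual unlock through the slow path as well.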
diff --git a/kernel/sched.c b/kernel/sched.c
index 66bd7ff23f18..799d23b4e35d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -305,6 +305,7 @@ struct rq {
305}; 305};
306 306
307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; 307static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
308static DEFINE_MUTEX(sched_hotcpu_mutex);
308 309
309static inline int cpu_of(struct rq *rq) 310static inline int cpu_of(struct rq *rq)
310{ 311{
@@ -4520,13 +4521,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4520 struct task_struct *p; 4521 struct task_struct *p;
4521 int retval; 4522 int retval;
4522 4523
4523 lock_cpu_hotplug(); 4524 mutex_lock(&sched_hotcpu_mutex);
4524 read_lock(&tasklist_lock); 4525 read_lock(&tasklist_lock);
4525 4526
4526 p = find_process_by_pid(pid); 4527 p = find_process_by_pid(pid);
4527 if (!p) { 4528 if (!p) {
4528 read_unlock(&tasklist_lock); 4529 read_unlock(&tasklist_lock);
4529 unlock_cpu_hotplug(); 4530 mutex_unlock(&sched_hotcpu_mutex);
4530 return -ESRCH; 4531 return -ESRCH;
4531 } 4532 }
4532 4533
@@ -4553,7 +4554,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4553 4554
4554out_unlock: 4555out_unlock:
4555 put_task_struct(p); 4556 put_task_struct(p);
4556 unlock_cpu_hotplug(); 4557 mutex_unlock(&sched_hotcpu_mutex);
4557 return retval; 4558 return retval;
4558} 4559}
4559 4560
@@ -4610,7 +4611,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4610 struct task_struct *p; 4611 struct task_struct *p;
4611 int retval; 4612 int retval;
4612 4613
4613 lock_cpu_hotplug(); 4614 mutex_lock(&sched_hotcpu_mutex);
4614 read_lock(&tasklist_lock); 4615 read_lock(&tasklist_lock);
4615 4616
4616 retval = -ESRCH; 4617 retval = -ESRCH;
@@ -4626,7 +4627,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4626 4627
4627out_unlock: 4628out_unlock:
4628 read_unlock(&tasklist_lock); 4629 read_unlock(&tasklist_lock);
4629 unlock_cpu_hotplug(); 4630 mutex_unlock(&sched_hotcpu_mutex);
4630 if (retval) 4631 if (retval)
4631 return retval; 4632 return retval;
4632 4633
@@ -5388,7 +5389,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5388 struct rq *rq; 5389 struct rq *rq;
5389 5390
5390 switch (action) { 5391 switch (action) {
5392 case CPU_LOCK_ACQUIRE:
5393 mutex_lock(&sched_hotcpu_mutex);
5394 break;
5395
5391 case CPU_UP_PREPARE: 5396 case CPU_UP_PREPARE:
5397 case CPU_UP_PREPARE_FROZEN:
5392 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5398 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
5393 if (IS_ERR(p)) 5399 if (IS_ERR(p))
5394 return NOTIFY_BAD; 5400 return NOTIFY_BAD;
@@ -5402,12 +5408,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5402 break; 5408 break;
5403 5409
5404 case CPU_ONLINE: 5410 case CPU_ONLINE:
5411 case CPU_ONLINE_FROZEN:
5405 /* Strictly unnecessary, as first user will wake it. */ 5412 /* Strictly unnecessary, as first user will wake it. */

5406 wake_up_process(cpu_rq(cpu)->migration_thread); 5413 wake_up_process(cpu_rq(cpu)->migration_thread);
5407 break; 5414 break;
5408 5415
5409#ifdef CONFIG_HOTPLUG_CPU 5416#ifdef CONFIG_HOTPLUG_CPU
5410 case CPU_UP_CANCELED: 5417 case CPU_UP_CANCELED:
5418 case CPU_UP_CANCELED_FROZEN:
5411 if (!cpu_rq(cpu)->migration_thread) 5419 if (!cpu_rq(cpu)->migration_thread)
5412 break; 5420 break;
5413 /* Unbind it from offline cpu so it can run. Fall thru. */ 5421 /* Unbind it from offline cpu so it can run. Fall thru. */
@@ -5418,6 +5426,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5418 break; 5426 break;
5419 5427
5420 case CPU_DEAD: 5428 case CPU_DEAD:
5429 case CPU_DEAD_FROZEN:
5421 migrate_live_tasks(cpu); 5430 migrate_live_tasks(cpu);
5422 rq = cpu_rq(cpu); 5431 rq = cpu_rq(cpu);
5423 kthread_stop(rq->migration_thread); 5432 kthread_stop(rq->migration_thread);
@@ -5433,7 +5442,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5433 BUG_ON(rq->nr_running != 0); 5442 BUG_ON(rq->nr_running != 0);
5434 5443
5435 /* No need to migrate the tasks: it was best-effort if 5444 /* No need to migrate the tasks: it was best-effort if
5436 * they didn't do lock_cpu_hotplug(). Just wake up 5445 * they didn't take sched_hotcpu_mutex. Just wake up
5437 * the requestors. */ 5446 * the requestors. */
5438 spin_lock_irq(&rq->lock); 5447 spin_lock_irq(&rq->lock);
5439 while (!list_empty(&rq->migration_queue)) { 5448 while (!list_empty(&rq->migration_queue)) {
@@ -5447,6 +5456,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5447 spin_unlock_irq(&rq->lock); 5456 spin_unlock_irq(&rq->lock);
5448 break; 5457 break;
5449#endif 5458#endif
5459 case CPU_LOCK_RELEASE:
5460 mutex_unlock(&sched_hotcpu_mutex);
5461 break;
5450 } 5462 }
5451 return NOTIFY_OK; 5463 return NOTIFY_OK;
5452} 5464}
@@ -6822,10 +6834,10 @@ int arch_reinit_sched_domains(void)
6822{ 6834{
6823 int err; 6835 int err;
6824 6836
6825 lock_cpu_hotplug(); 6837 mutex_lock(&sched_hotcpu_mutex);
6826 detach_destroy_domains(&cpu_online_map); 6838 detach_destroy_domains(&cpu_online_map);
6827 err = arch_init_sched_domains(&cpu_online_map); 6839 err = arch_init_sched_domains(&cpu_online_map);
6828 unlock_cpu_hotplug(); 6840 mutex_unlock(&sched_hotcpu_mutex);
6829 6841
6830 return err; 6842 return err;
6831} 6843}
@@ -6904,14 +6916,20 @@ static int update_sched_domains(struct notifier_block *nfb,
6904{ 6916{
6905 switch (action) { 6917 switch (action) {
6906 case CPU_UP_PREPARE: 6918 case CPU_UP_PREPARE:
6919 case CPU_UP_PREPARE_FROZEN:
6907 case CPU_DOWN_PREPARE: 6920 case CPU_DOWN_PREPARE:
6921 case CPU_DOWN_PREPARE_FROZEN:
6908 detach_destroy_domains(&cpu_online_map); 6922 detach_destroy_domains(&cpu_online_map);
6909 return NOTIFY_OK; 6923 return NOTIFY_OK;
6910 6924
6911 case CPU_UP_CANCELED: 6925 case CPU_UP_CANCELED:
6926 case CPU_UP_CANCELED_FROZEN:
6912 case CPU_DOWN_FAILED: 6927 case CPU_DOWN_FAILED:
6928 case CPU_DOWN_FAILED_FROZEN:
6913 case CPU_ONLINE: 6929 case CPU_ONLINE:
6930 case CPU_ONLINE_FROZEN:
6914 case CPU_DEAD: 6931 case CPU_DEAD:
6932 case CPU_DEAD_FROZEN:
6915 /* 6933 /*
6916 * Fall through and re-initialise the domains. 6934 * Fall through and re-initialise the domains.
6917 */ 6935 */
@@ -6930,12 +6948,12 @@ void __init sched_init_smp(void)
6930{ 6948{
6931 cpumask_t non_isolated_cpus; 6949 cpumask_t non_isolated_cpus;
6932 6950
6933 lock_cpu_hotplug(); 6951 mutex_lock(&sched_hotcpu_mutex);
6934 arch_init_sched_domains(&cpu_online_map); 6952 arch_init_sched_domains(&cpu_online_map);
6935 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 6953 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6936 if (cpus_empty(non_isolated_cpus)) 6954 if (cpus_empty(non_isolated_cpus))
6937 cpu_set(smp_processor_id(), non_isolated_cpus); 6955 cpu_set(smp_processor_id(), non_isolated_cpus);
6938 unlock_cpu_hotplug(); 6956 mutex_unlock(&sched_hotcpu_mutex);
6939 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6957 /* XXX: Theoretical race here - CPU may be hotplugged now */
6940 hotcpu_notifier(update_sched_domains, 0); 6958 hotcpu_notifier(update_sched_domains, 0);
6941 6959
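
The scheduler now serializes against hotplug with its own sched_hotcpu_mutex, taken and released from the notifier via the new CPU_LOCK_ACQUIRE/CPU_LOCK_RELEASE events instead of lock_cpu_hotplug(). A stripped-down sketch of that pattern for some other subsystem, with purely illustrative names:

#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/notifier.h>

static DEFINE_MUTEX(example_hotcpu_mutex);

static int example_hotcpu_callback(struct notifier_block *nb,
				   unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_LOCK_ACQUIRE:
		/* hotplug is starting: block readers of our CPU state */
		mutex_lock(&example_hotcpu_mutex);
		break;
	case CPU_LOCK_RELEASE:
		mutex_unlock(&example_hotcpu_mutex);
		break;
	}
	return NOTIFY_OK;
}

/* Code that used to call lock_cpu_hotplug() while walking per-cpu data
 * simply takes example_hotcpu_mutex instead. */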
diff --git a/kernel/signal.c b/kernel/signal.c
index 1368e67c8482..2ac3a668d9dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -38,125 +38,6 @@
38 38
39static struct kmem_cache *sigqueue_cachep; 39static struct kmem_cache *sigqueue_cachep;
40 40
41/*
42 * In POSIX a signal is sent either to a specific thread (Linux task)
43 * or to the process as a whole (Linux thread group). How the signal
44 * is sent determines whether it's to one thread or the whole group,
45 * which determines which signal mask(s) are involved in blocking it
46 * from being delivered until later. When the signal is delivered,
47 * either it's caught or ignored by a user handler or it has a default
48 * effect that applies to the whole thread group (POSIX process).
49 *
50 * The possible effects an unblocked signal set to SIG_DFL can have are:
51 * ignore - Nothing Happens
52 * terminate - kill the process, i.e. all threads in the group,
53 * similar to exit_group. The group leader (only) reports
54 * WIFSIGNALED status to its parent.
55 * coredump - write a core dump file describing all threads using
56 * the same mm and then kill all those threads
57 * stop - stop all the threads in the group, i.e. TASK_STOPPED state
58 *
59 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
60 * Other signals when not blocked and set to SIG_DFL behaves as follows.
61 * The job control signals also have other special effects.
62 *
63 * +--------------------+------------------+
64 * | POSIX signal | default action |
65 * +--------------------+------------------+
66 * | SIGHUP | terminate |
67 * | SIGINT | terminate |
68 * | SIGQUIT | coredump |
69 * | SIGILL | coredump |
70 * | SIGTRAP | coredump |
71 * | SIGABRT/SIGIOT | coredump |
72 * | SIGBUS | coredump |
73 * | SIGFPE | coredump |
74 * | SIGKILL | terminate(+) |
75 * | SIGUSR1 | terminate |
76 * | SIGSEGV | coredump |
77 * | SIGUSR2 | terminate |
78 * | SIGPIPE | terminate |
79 * | SIGALRM | terminate |
80 * | SIGTERM | terminate |
81 * | SIGCHLD | ignore |
82 * | SIGCONT | ignore(*) |
83 * | SIGSTOP | stop(*)(+) |
84 * | SIGTSTP | stop(*) |
85 * | SIGTTIN | stop(*) |
86 * | SIGTTOU | stop(*) |
87 * | SIGURG | ignore |
88 * | SIGXCPU | coredump |
89 * | SIGXFSZ | coredump |
90 * | SIGVTALRM | terminate |
91 * | SIGPROF | terminate |
92 * | SIGPOLL/SIGIO | terminate |
93 * | SIGSYS/SIGUNUSED | coredump |
94 * | SIGSTKFLT | terminate |
95 * | SIGWINCH | ignore |
96 * | SIGPWR | terminate |
97 * | SIGRTMIN-SIGRTMAX | terminate |
98 * +--------------------+------------------+
99 * | non-POSIX signal | default action |
100 * +--------------------+------------------+
101 * | SIGEMT | coredump |
102 * +--------------------+------------------+
103 *
104 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
105 * (*) Special job control effects:
106 * When SIGCONT is sent, it resumes the process (all threads in the group)
107 * from TASK_STOPPED state and also clears any pending/queued stop signals
108 * (any of those marked with "stop(*)"). This happens regardless of blocking,
109 * catching, or ignoring SIGCONT. When any stop signal is sent, it clears
110 * any pending/queued SIGCONT signals; this happens regardless of blocking,
111 * catching, or ignored the stop signal, though (except for SIGSTOP) the
112 * default action of stopping the process may happen later or never.
113 */
114
115#ifdef SIGEMT
116#define M_SIGEMT M(SIGEMT)
117#else
118#define M_SIGEMT 0
119#endif
120
121#if SIGRTMIN > BITS_PER_LONG
122#define M(sig) (1ULL << ((sig)-1))
123#else
124#define M(sig) (1UL << ((sig)-1))
125#endif
126#define T(sig, mask) (M(sig) & (mask))
127
128#define SIG_KERNEL_ONLY_MASK (\
129 M(SIGKILL) | M(SIGSTOP) )
130
131#define SIG_KERNEL_STOP_MASK (\
132 M(SIGSTOP) | M(SIGTSTP) | M(SIGTTIN) | M(SIGTTOU) )
133
134#define SIG_KERNEL_COREDUMP_MASK (\
135 M(SIGQUIT) | M(SIGILL) | M(SIGTRAP) | M(SIGABRT) | \
136 M(SIGFPE) | M(SIGSEGV) | M(SIGBUS) | M(SIGSYS) | \
137 M(SIGXCPU) | M(SIGXFSZ) | M_SIGEMT )
138
139#define SIG_KERNEL_IGNORE_MASK (\
140 M(SIGCONT) | M(SIGCHLD) | M(SIGWINCH) | M(SIGURG) )
141
142#define sig_kernel_only(sig) \
143 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_ONLY_MASK))
144#define sig_kernel_coredump(sig) \
145 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_COREDUMP_MASK))
146#define sig_kernel_ignore(sig) \
147 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_IGNORE_MASK))
148#define sig_kernel_stop(sig) \
149 (((sig) < SIGRTMIN) && T(sig, SIG_KERNEL_STOP_MASK))
150
151#define sig_needs_tasklist(sig) ((sig) == SIGCONT)
152
153#define sig_user_defined(t, signr) \
154 (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \
155 ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN))
156
157#define sig_fatal(t, signr) \
158 (!T(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
159 (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)
160 41
161static int sig_ignored(struct task_struct *t, int sig) 42static int sig_ignored(struct task_struct *t, int sig)
162{ 43{
@@ -328,6 +209,16 @@ void flush_signals(struct task_struct *t)
328 spin_unlock_irqrestore(&t->sighand->siglock, flags); 209 spin_unlock_irqrestore(&t->sighand->siglock, flags);
329} 210}
330 211
212void ignore_signals(struct task_struct *t)
213{
214 int i;
215
216 for (i = 0; i < _NSIG; ++i)
217 t->sighand->action[i].sa.sa_handler = SIG_IGN;
218
219 flush_signals(t);
220}
221
331/* 222/*
332 * Flush all handlers for a task. 223 * Flush all handlers for a task.
333 */ 224 */
@@ -1032,17 +923,6 @@ void zap_other_threads(struct task_struct *p)
1032 if (t->exit_state) 923 if (t->exit_state)
1033 continue; 924 continue;
1034 925
1035 /*
1036 * We don't want to notify the parent, since we are
1037 * killed as part of a thread group due to another
1038 * thread doing an execve() or similar. So set the
1039 * exit signal to -1 to allow immediate reaping of
1040 * the process. But don't detach the thread group
1041 * leader.
1042 */
1043 if (t != p->group_leader)
1044 t->exit_signal = -1;
1045
1046 /* SIGKILL will be handled before any pending SIGSTOP */ 926 /* SIGKILL will be handled before any pending SIGSTOP */
1047 sigaddset(&t->pending.signal, SIGKILL); 927 sigaddset(&t->pending.signal, SIGKILL);
1048 signal_wake_up(t, 1); 928 signal_wake_up(t, 1);
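
The new ignore_signals() helper sets every handler to SIG_IGN and flushes anything already queued; the kthread changes elsewhere in this series use it so worker threads cannot be stopped or killed from userspace. A minimal sketch of a thread function using it (illustrative, not taken from the patch):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/signal.h>

static int example_thread_fn(void *unused)
{
	/* assumed usage of the helper added above */
	ignore_signals(current);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);

	return 0;
}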
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 8b75008e2bd8..0b9886a00e74 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -593,6 +593,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
593 593
594 switch (action) { 594 switch (action) {
595 case CPU_UP_PREPARE: 595 case CPU_UP_PREPARE:
596 case CPU_UP_PREPARE_FROZEN:
596 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 597 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
597 if (IS_ERR(p)) { 598 if (IS_ERR(p)) {
598 printk("ksoftirqd for %i failed\n", hotcpu); 599 printk("ksoftirqd for %i failed\n", hotcpu);
@@ -602,16 +603,19 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
602 per_cpu(ksoftirqd, hotcpu) = p; 603 per_cpu(ksoftirqd, hotcpu) = p;
603 break; 604 break;
604 case CPU_ONLINE: 605 case CPU_ONLINE:
606 case CPU_ONLINE_FROZEN:
605 wake_up_process(per_cpu(ksoftirqd, hotcpu)); 607 wake_up_process(per_cpu(ksoftirqd, hotcpu));
606 break; 608 break;
607#ifdef CONFIG_HOTPLUG_CPU 609#ifdef CONFIG_HOTPLUG_CPU
608 case CPU_UP_CANCELED: 610 case CPU_UP_CANCELED:
611 case CPU_UP_CANCELED_FROZEN:
609 if (!per_cpu(ksoftirqd, hotcpu)) 612 if (!per_cpu(ksoftirqd, hotcpu))
610 break; 613 break;
611 /* Unbind so it can run. Fall thru. */ 614 /* Unbind so it can run. Fall thru. */
612 kthread_bind(per_cpu(ksoftirqd, hotcpu), 615 kthread_bind(per_cpu(ksoftirqd, hotcpu),
613 any_online_cpu(cpu_online_map)); 616 any_online_cpu(cpu_online_map));
614 case CPU_DEAD: 617 case CPU_DEAD:
618 case CPU_DEAD_FROZEN:
615 p = per_cpu(ksoftirqd, hotcpu); 619 p = per_cpu(ksoftirqd, hotcpu);
616 per_cpu(ksoftirqd, hotcpu) = NULL; 620 per_cpu(ksoftirqd, hotcpu) = NULL;
617 kthread_stop(p); 621 kthread_stop(p);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 8fa7040247ad..0131e296ffb4 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -146,6 +146,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
146 146
147 switch (action) { 147 switch (action) {
148 case CPU_UP_PREPARE: 148 case CPU_UP_PREPARE:
149 case CPU_UP_PREPARE_FROZEN:
149 BUG_ON(per_cpu(watchdog_task, hotcpu)); 150 BUG_ON(per_cpu(watchdog_task, hotcpu));
150 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 151 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
151 if (IS_ERR(p)) { 152 if (IS_ERR(p)) {
@@ -157,16 +158,19 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
157 kthread_bind(p, hotcpu); 158 kthread_bind(p, hotcpu);
158 break; 159 break;
159 case CPU_ONLINE: 160 case CPU_ONLINE:
161 case CPU_ONLINE_FROZEN:
160 wake_up_process(per_cpu(watchdog_task, hotcpu)); 162 wake_up_process(per_cpu(watchdog_task, hotcpu));
161 break; 163 break;
162#ifdef CONFIG_HOTPLUG_CPU 164#ifdef CONFIG_HOTPLUG_CPU
163 case CPU_UP_CANCELED: 165 case CPU_UP_CANCELED:
166 case CPU_UP_CANCELED_FROZEN:
164 if (!per_cpu(watchdog_task, hotcpu)) 167 if (!per_cpu(watchdog_task, hotcpu))
165 break; 168 break;
166 /* Unbind so it can run. Fall thru. */ 169 /* Unbind so it can run. Fall thru. */
167 kthread_bind(per_cpu(watchdog_task, hotcpu), 170 kthread_bind(per_cpu(watchdog_task, hotcpu),
168 any_online_cpu(cpu_online_map)); 171 any_online_cpu(cpu_online_map));
169 case CPU_DEAD: 172 case CPU_DEAD:
173 case CPU_DEAD_FROZEN:
170 p = per_cpu(watchdog_task, hotcpu); 174 p = per_cpu(watchdog_task, hotcpu);
171 per_cpu(watchdog_task, hotcpu) = NULL; 175 per_cpu(watchdog_task, hotcpu) = NULL;
172 kthread_stop(p); 176 kthread_stop(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 0742c938dfa7..cdb7e9457ba6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -134,19 +134,39 @@ static int notifier_chain_unregister(struct notifier_block **nl,
134 return -ENOENT; 134 return -ENOENT;
135} 135}
136 136
137/**
138 * notifier_call_chain - Informs the registered notifiers about an event.
139 * @nl: Pointer to head of the blocking notifier chain
140 * @val: Value passed unmodified to notifier function
141 * @v: Pointer passed unmodified to notifier function
 142 * @nr_to_call: Number of notifier functions to be called. Pass -1 to
 143 * call all of them.
 144 * @nr_calls: Records the number of notifications sent. Pass NULL if
 145 * the count is not needed.
146 * @returns: notifier_call_chain returns the value returned by the
147 * last notifier function called.
148 */
149
137static int __kprobes notifier_call_chain(struct notifier_block **nl, 150static int __kprobes notifier_call_chain(struct notifier_block **nl,
138 unsigned long val, void *v) 151 unsigned long val, void *v,
152 int nr_to_call, int *nr_calls)
139{ 153{
140 int ret = NOTIFY_DONE; 154 int ret = NOTIFY_DONE;
141 struct notifier_block *nb, *next_nb; 155 struct notifier_block *nb, *next_nb;
142 156
143 nb = rcu_dereference(*nl); 157 nb = rcu_dereference(*nl);
144 while (nb) { 158
159 while (nb && nr_to_call) {
145 next_nb = rcu_dereference(nb->next); 160 next_nb = rcu_dereference(nb->next);
146 ret = nb->notifier_call(nb, val, v); 161 ret = nb->notifier_call(nb, val, v);
162
163 if (nr_calls)
164 (*nr_calls)++;
165
147 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 166 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
148 break; 167 break;
149 nb = next_nb; 168 nb = next_nb;
169 nr_to_call--;
150 } 170 }
151 return ret; 171 return ret;
152} 172}
@@ -205,10 +225,12 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
205EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); 225EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
206 226
207/** 227/**
208 * atomic_notifier_call_chain - Call functions in an atomic notifier chain 228 * __atomic_notifier_call_chain - Call functions in an atomic notifier chain
209 * @nh: Pointer to head of the atomic notifier chain 229 * @nh: Pointer to head of the atomic notifier chain
210 * @val: Value passed unmodified to notifier function 230 * @val: Value passed unmodified to notifier function
211 * @v: Pointer passed unmodified to notifier function 231 * @v: Pointer passed unmodified to notifier function
232 * @nr_to_call: See the comment for notifier_call_chain.
233 * @nr_calls: See the comment for notifier_call_chain.
212 * 234 *
213 * Calls each function in a notifier chain in turn. The functions 235 * Calls each function in a notifier chain in turn. The functions
214 * run in an atomic context, so they must not block. 236 * run in an atomic context, so they must not block.
@@ -222,19 +244,27 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
222 * of the last notifier function called. 244 * of the last notifier function called.
223 */ 245 */
224 246
225int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, 247int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
226 unsigned long val, void *v) 248 unsigned long val, void *v,
249 int nr_to_call, int *nr_calls)
227{ 250{
228 int ret; 251 int ret;
229 252
230 rcu_read_lock(); 253 rcu_read_lock();
231 ret = notifier_call_chain(&nh->head, val, v); 254 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
232 rcu_read_unlock(); 255 rcu_read_unlock();
233 return ret; 256 return ret;
234} 257}
235 258
236EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); 259EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
260
261int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
262 unsigned long val, void *v)
263{
264 return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
265}
237 266
267EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
238/* 268/*
239 * Blocking notifier chain routines. All access to the chain is 269 * Blocking notifier chain routines. All access to the chain is
240 * synchronized by an rwsem. 270 * synchronized by an rwsem.
@@ -304,10 +334,12 @@ int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
304EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); 334EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
305 335
306/** 336/**
307 * blocking_notifier_call_chain - Call functions in a blocking notifier chain 337 * __blocking_notifier_call_chain - Call functions in a blocking notifier chain
308 * @nh: Pointer to head of the blocking notifier chain 338 * @nh: Pointer to head of the blocking notifier chain
309 * @val: Value passed unmodified to notifier function 339 * @val: Value passed unmodified to notifier function
310 * @v: Pointer passed unmodified to notifier function 340 * @v: Pointer passed unmodified to notifier function
341 * @nr_to_call: See comment for notifier_call_chain.
342 * @nr_calls: See comment for notifier_call_chain.
311 * 343 *
312 * Calls each function in a notifier chain in turn. The functions 344 * Calls each function in a notifier chain in turn. The functions
313 * run in a process context, so they are allowed to block. 345 * run in a process context, so they are allowed to block.
@@ -320,8 +352,9 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);
320 * of the last notifier function called. 352 * of the last notifier function called.
321 */ 353 */
322 354
323int blocking_notifier_call_chain(struct blocking_notifier_head *nh, 355int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
324 unsigned long val, void *v) 356 unsigned long val, void *v,
357 int nr_to_call, int *nr_calls)
325{ 358{
326 int ret = NOTIFY_DONE; 359 int ret = NOTIFY_DONE;
327 360
@@ -332,12 +365,19 @@ int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
332 */ 365 */
333 if (rcu_dereference(nh->head)) { 366 if (rcu_dereference(nh->head)) {
334 down_read(&nh->rwsem); 367 down_read(&nh->rwsem);
335 ret = notifier_call_chain(&nh->head, val, v); 368 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
369 nr_calls);
336 up_read(&nh->rwsem); 370 up_read(&nh->rwsem);
337 } 371 }
338 return ret; 372 return ret;
339} 373}
374EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain);
340 375
376int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
377 unsigned long val, void *v)
378{
379 return __blocking_notifier_call_chain(nh, val, v, -1, NULL);
380}
341EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); 381EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);
342 382
343/* 383/*
@@ -383,10 +423,12 @@ int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
383EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); 423EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
384 424
385/** 425/**
386 * raw_notifier_call_chain - Call functions in a raw notifier chain 426 * __raw_notifier_call_chain - Call functions in a raw notifier chain
387 * @nh: Pointer to head of the raw notifier chain 427 * @nh: Pointer to head of the raw notifier chain
388 * @val: Value passed unmodified to notifier function 428 * @val: Value passed unmodified to notifier function
389 * @v: Pointer passed unmodified to notifier function 429 * @v: Pointer passed unmodified to notifier function
430 * @nr_to_call: See comment for notifier_call_chain.
431 * @nr_calls: See comment for notifier_call_chain
390 * 432 *
391 * Calls each function in a notifier chain in turn. The functions 433 * Calls each function in a notifier chain in turn. The functions
392 * run in an undefined context. 434 * run in an undefined context.
@@ -400,10 +442,19 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);
400 * of the last notifier function called. 442 * of the last notifier function called.
401 */ 443 */
402 444
445int __raw_notifier_call_chain(struct raw_notifier_head *nh,
446 unsigned long val, void *v,
447 int nr_to_call, int *nr_calls)
448{
449 return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
450}
451
452EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);
453
403int raw_notifier_call_chain(struct raw_notifier_head *nh, 454int raw_notifier_call_chain(struct raw_notifier_head *nh,
404 unsigned long val, void *v) 455 unsigned long val, void *v)
405{ 456{
406 return notifier_call_chain(&nh->head, val, v); 457 return __raw_notifier_call_chain(nh, val, v, -1, NULL);
407} 458}
408 459
409EXPORT_SYMBOL_GPL(raw_notifier_call_chain); 460EXPORT_SYMBOL_GPL(raw_notifier_call_chain);
@@ -478,10 +529,12 @@ int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
478EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); 529EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
479 530
480/** 531/**
481 * srcu_notifier_call_chain - Call functions in an SRCU notifier chain 532 * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain
482 * @nh: Pointer to head of the SRCU notifier chain 533 * @nh: Pointer to head of the SRCU notifier chain
483 * @val: Value passed unmodified to notifier function 534 * @val: Value passed unmodified to notifier function
484 * @v: Pointer passed unmodified to notifier function 535 * @v: Pointer passed unmodified to notifier function
536 * @nr_to_call: See comment for notifier_call_chain.
537 * @nr_calls: See comment for notifier_call_chain
485 * 538 *
486 * Calls each function in a notifier chain in turn. The functions 539 * Calls each function in a notifier chain in turn. The functions
487 * run in a process context, so they are allowed to block. 540 * run in a process context, so they are allowed to block.
@@ -494,18 +547,25 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);
494 * of the last notifier function called. 547 * of the last notifier function called.
495 */ 548 */
496 549
497int srcu_notifier_call_chain(struct srcu_notifier_head *nh, 550int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
498 unsigned long val, void *v) 551 unsigned long val, void *v,
552 int nr_to_call, int *nr_calls)
499{ 553{
500 int ret; 554 int ret;
501 int idx; 555 int idx;
502 556
503 idx = srcu_read_lock(&nh->srcu); 557 idx = srcu_read_lock(&nh->srcu);
504 ret = notifier_call_chain(&nh->head, val, v); 558 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
505 srcu_read_unlock(&nh->srcu, idx); 559 srcu_read_unlock(&nh->srcu, idx);
506 return ret; 560 return ret;
507} 561}
562EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain);
508 563
564int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
565 unsigned long val, void *v)
566{
567 return __srcu_notifier_call_chain(nh, val, v, -1, NULL);
568}
509EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); 569EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);
510 570
511/** 571/**
@@ -881,7 +941,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
881#ifdef CONFIG_SOFTWARE_SUSPEND 941#ifdef CONFIG_SOFTWARE_SUSPEND
882 case LINUX_REBOOT_CMD_SW_SUSPEND: 942 case LINUX_REBOOT_CMD_SW_SUSPEND:
883 { 943 {
884 int ret = pm_suspend(PM_SUSPEND_DISK); 944 int ret = hibernate();
885 unlock_kernel(); 945 unlock_kernel();
886 return ret; 946 return ret;
887 } 947 }
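
The point of the nr_to_call/nr_calls arguments added to the notifier code above is partial delivery: a caller can send a "prepare" event, record how many notifiers saw it, and on failure send the matching "cancel" event to exactly those notifiers. A rough sketch of that idiom follows; the event codes and function name are illustrative, but the CPU-hotplug code in this series uses the same shape.

#include <linux/errno.h>
#include <linux/notifier.h>

#define EXAMPLE_PREPARE	0x0001
#define EXAMPLE_CANCEL	0x0002

static int example_prepare_with_rollback(struct raw_notifier_head *chain,
					 void *arg)
{
	int nr_calls = 0;
	int ret;

	ret = __raw_notifier_call_chain(chain, EXAMPLE_PREPARE, arg,
					-1, &nr_calls);
	if (ret == NOTIFY_BAD) {
		/*
		 * nr_calls includes the notifier that refused, so roll
		 * back only the ones called before it.
		 */
		__raw_notifier_call_chain(chain, EXAMPLE_CANCEL, arg,
					  nr_calls - 1, NULL);
		return -EBUSY;
	}
	return 0;
}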
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f0664bd5011c..4073353abd4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -77,6 +77,7 @@ extern int sysctl_drop_caches;
77extern int percpu_pagelist_fraction; 77extern int percpu_pagelist_fraction;
78extern int compat_log; 78extern int compat_log;
79extern int maps_protect; 79extern int maps_protect;
80extern int sysctl_stat_interval;
80 81
81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 82/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
82static int maxolduid = 65535; 83static int maxolduid = 65535;
@@ -857,6 +858,17 @@ static ctl_table vm_table[] = {
857 .extra2 = &one_hundred, 858 .extra2 = &one_hundred,
858 }, 859 },
859#endif 860#endif
861#ifdef CONFIG_SMP
862 {
863 .ctl_name = CTL_UNNUMBERED,
864 .procname = "stat_interval",
865 .data = &sysctl_stat_interval,
866 .maxlen = sizeof(sysctl_stat_interval),
867 .mode = 0644,
868 .proc_handler = &proc_dointvec_jiffies,
869 .strategy = &sysctl_jiffies,
870 },
871#endif
860#if defined(CONFIG_X86_32) || \ 872#if defined(CONFIG_X86_32) || \
861 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 873 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
862 { 874 {
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index fe5c7db24247..3db5c3c460d7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -74,15 +74,17 @@ static struct clocksource *watchdog;
74static struct timer_list watchdog_timer; 74static struct timer_list watchdog_timer;
75static DEFINE_SPINLOCK(watchdog_lock); 75static DEFINE_SPINLOCK(watchdog_lock);
76static cycle_t watchdog_last; 76static cycle_t watchdog_last;
77static int watchdog_resumed;
78
77/* 79/*
78 * Interval: 0.5sec Treshold: 0.0625s 80 * Interval: 0.5sec Threshold: 0.0625s
79 */ 81 */
80#define WATCHDOG_INTERVAL (HZ >> 1) 82#define WATCHDOG_INTERVAL (HZ >> 1)
81#define WATCHDOG_TRESHOLD (NSEC_PER_SEC >> 4) 83#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
82 84
83static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 85static void clocksource_ratewd(struct clocksource *cs, int64_t delta)
84{ 86{
85 if (delta > -WATCHDOG_TRESHOLD && delta < WATCHDOG_TRESHOLD) 87 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD)
86 return; 88 return;
87 89
88 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 90 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
@@ -98,15 +100,26 @@ static void clocksource_watchdog(unsigned long data)
98 struct clocksource *cs, *tmp; 100 struct clocksource *cs, *tmp;
99 cycle_t csnow, wdnow; 101 cycle_t csnow, wdnow;
100 int64_t wd_nsec, cs_nsec; 102 int64_t wd_nsec, cs_nsec;
103 int resumed;
101 104
102 spin_lock(&watchdog_lock); 105 spin_lock(&watchdog_lock);
103 106
107 resumed = watchdog_resumed;
108 if (unlikely(resumed))
109 watchdog_resumed = 0;
110
104 wdnow = watchdog->read(); 111 wdnow = watchdog->read();
105 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 112 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
106 watchdog_last = wdnow; 113 watchdog_last = wdnow;
107 114
108 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 115 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
109 csnow = cs->read(); 116 csnow = cs->read();
117
118 if (unlikely(resumed)) {
119 cs->wd_last = csnow;
120 continue;
121 }
122
110 /* Initialized ? */ 123 /* Initialized ? */
111 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 124 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
112 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && 125 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
@@ -136,6 +149,13 @@ static void clocksource_watchdog(unsigned long data)
136 } 149 }
137 spin_unlock(&watchdog_lock); 150 spin_unlock(&watchdog_lock);
138} 151}
152static void clocksource_resume_watchdog(void)
153{
154 spin_lock(&watchdog_lock);
155 watchdog_resumed = 1;
156 spin_unlock(&watchdog_lock);
157}
158
139static void clocksource_check_watchdog(struct clocksource *cs) 159static void clocksource_check_watchdog(struct clocksource *cs)
140{ 160{
141 struct clocksource *cse; 161 struct clocksource *cse;
@@ -182,9 +202,34 @@ static void clocksource_check_watchdog(struct clocksource *cs)
182 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 202 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
183 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 203 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
184} 204}
205
206static inline void clocksource_resume_watchdog(void) { }
185#endif 207#endif
186 208
187/** 209/**
210 * clocksource_resume - resume the clocksource(s)
211 */
212void clocksource_resume(void)
213{
214 struct list_head *tmp;
215 unsigned long flags;
216
217 spin_lock_irqsave(&clocksource_lock, flags);
218
219 list_for_each(tmp, &clocksource_list) {
220 struct clocksource *cs;
221
222 cs = list_entry(tmp, struct clocksource, list);
223 if (cs->resume)
224 cs->resume();
225 }
226
227 clocksource_resume_watchdog();
228
229 spin_unlock_irqrestore(&clocksource_lock, flags);
230}
231
232/**
188 * clocksource_get_next - Returns the selected clocksource 233 * clocksource_get_next - Returns the selected clocksource
189 * 234 *
190 */ 235 */
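
clocksource_resume() walks the registered clocksources after a resume, invokes the new optional ->resume() hook, and tells the watchdog to skip one comparison interval so a counter that stopped across suspend is not flagged as unstable. A hedged sketch of a clocksource supplying that hook, with all names and values illustrative and using the clocksource API as it stands in this kernel:

#include <linux/clocksource.h>

static cycle_t example_cs_read(void)
{
	return (cycle_t)0;	/* read the hardware counter here */
}

static void example_cs_resume(void)
{
	/* reprogram or restart the hardware counter after resume */
}

static struct clocksource example_clocksource = {
	.name	= "example",
	.rating	= 100,
	.read	= example_cs_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.shift	= 20,	/* .mult filled in, e.g. via clocksource_hz2mult() */
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.resume	= example_cs_resume,
};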
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index b734ca4bc75e..8bbcfb77f7d2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -65,7 +65,7 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); 65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
66#endif 66#endif
67 SEQ_printf(m, "\n"); 67 SEQ_printf(m, "\n");
68 SEQ_printf(m, " # expires at %Ld nsecs [in %Ld nsecs]\n", 68 SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n",
69 (unsigned long long)ktime_to_ns(timer->expires), 69 (unsigned long long)ktime_to_ns(timer->expires),
70 (unsigned long long)(ktime_to_ns(timer->expires) - now)); 70 (unsigned long long)(ktime_to_ns(timer->expires) - now));
71} 71}
@@ -111,14 +111,14 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
111{ 111{
112 SEQ_printf(m, " .index: %d\n", 112 SEQ_printf(m, " .index: %d\n",
113 base->index); 113 base->index);
114 SEQ_printf(m, " .resolution: %Ld nsecs\n", 114 SEQ_printf(m, " .resolution: %Lu nsecs\n",
115 (unsigned long long)ktime_to_ns(base->resolution)); 115 (unsigned long long)ktime_to_ns(base->resolution));
116 SEQ_printf(m, " .get_time: "); 116 SEQ_printf(m, " .get_time: ");
117 print_name_offset(m, base->get_time); 117 print_name_offset(m, base->get_time);
118 SEQ_printf(m, "\n"); 118 SEQ_printf(m, "\n");
119#ifdef CONFIG_HIGH_RES_TIMERS 119#ifdef CONFIG_HIGH_RES_TIMERS
120 SEQ_printf(m, " .offset: %Ld nsecs\n", 120 SEQ_printf(m, " .offset: %Lu nsecs\n",
121 ktime_to_ns(base->offset)); 121 (unsigned long long) ktime_to_ns(base->offset));
122#endif 122#endif
123 SEQ_printf(m, "active timers:\n"); 123 SEQ_printf(m, "active timers:\n");
124 print_active_timers(m, base, now); 124 print_active_timers(m, base, now);
@@ -135,10 +135,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
135 print_base(m, cpu_base->clock_base + i, now); 135 print_base(m, cpu_base->clock_base + i, now);
136 } 136 }
137#define P(x) \ 137#define P(x) \
138 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(cpu_base->x)) 138 SEQ_printf(m, " .%-15s: %Lu\n", #x, \
139 (unsigned long long)(cpu_base->x))
139#define P_ns(x) \ 140#define P_ns(x) \
140 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ 141 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
141 (u64)(ktime_to_ns(cpu_base->x))) 142 (unsigned long long)(ktime_to_ns(cpu_base->x)))
142 143
143#ifdef CONFIG_HIGH_RES_TIMERS 144#ifdef CONFIG_HIGH_RES_TIMERS
144 P_ns(expires_next); 145 P_ns(expires_next);
@@ -150,10 +151,11 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 151
151#ifdef CONFIG_TICK_ONESHOT 152#ifdef CONFIG_TICK_ONESHOT
152# define P(x) \ 153# define P(x) \
153 SEQ_printf(m, " .%-15s: %Ld\n", #x, (u64)(ts->x)) 154 SEQ_printf(m, " .%-15s: %Lu\n", #x, \
155 (unsigned long long)(ts->x))
154# define P_ns(x) \ 156# define P_ns(x) \
155 SEQ_printf(m, " .%-15s: %Ld nsecs\n", #x, \ 157 SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \
156 (u64)(ktime_to_ns(ts->x))) 158 (unsigned long long)(ktime_to_ns(ts->x)))
157 { 159 {
158 struct tick_sched *ts = tick_get_tick_sched(cpu); 160 struct tick_sched *ts = tick_get_tick_sched(cpu);
159 P(nohz_mode); 161 P(nohz_mode);
@@ -167,7 +169,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
167 P(last_jiffies); 169 P(last_jiffies);
168 P(next_jiffies); 170 P(next_jiffies);
169 P_ns(idle_expires); 171 P_ns(idle_expires);
170 SEQ_printf(m, "jiffies: %Ld\n", (u64)jiffies); 172 SEQ_printf(m, "jiffies: %Lu\n",
173 (unsigned long long)jiffies);
171 } 174 }
172#endif 175#endif
173 176
diff --git a/kernel/timer.c b/kernel/timer.c
index 7a6448340f90..59a28b1752f8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -92,24 +92,24 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
92/* Functions below help us manage 'deferrable' flag */ 92/* Functions below help us manage 'deferrable' flag */
93static inline unsigned int tbase_get_deferrable(tvec_base_t *base) 93static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
94{ 94{
95 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 95 return (unsigned int)((unsigned long)base & TBASE_DEFERRABLE_FLAG);
96} 96}
97 97
98static inline tvec_base_t *tbase_get_base(tvec_base_t *base) 98static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
99{ 99{
100 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 100 return (tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG);
101} 101}
102 102
103static inline void timer_set_deferrable(struct timer_list *timer) 103static inline void timer_set_deferrable(struct timer_list *timer)
104{ 104{
105 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 105 timer->base = (tvec_base_t *)((unsigned long)timer->base |
106 TBASE_DEFERRABLE_FLAG)); 106 TBASE_DEFERRABLE_FLAG);
107} 107}
108 108
109static inline void 109static inline void
110timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 110timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
111{ 111{
112 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 112 timer->base = (tvec_base_t *)((unsigned long)new_base |
113 tbase_get_deferrable(timer->base)); 113 tbase_get_deferrable(timer->base));
114} 114}
115 115
@@ -1293,11 +1293,13 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1293 long cpu = (long)hcpu; 1293 long cpu = (long)hcpu;
1294 switch(action) { 1294 switch(action) {
1295 case CPU_UP_PREPARE: 1295 case CPU_UP_PREPARE:
1296 case CPU_UP_PREPARE_FROZEN:
1296 if (init_timers_cpu(cpu) < 0) 1297 if (init_timers_cpu(cpu) < 0)
1297 return NOTIFY_BAD; 1298 return NOTIFY_BAD;
1298 break; 1299 break;
1299#ifdef CONFIG_HOTPLUG_CPU 1300#ifdef CONFIG_HOTPLUG_CPU
1300 case CPU_DEAD: 1301 case CPU_DEAD:
1302 case CPU_DEAD_FROZEN:
1301 migrate_timers(cpu); 1303 migrate_timers(cpu);
1302 break; 1304 break;
1303#endif 1305#endif
@@ -1497,6 +1499,8 @@ unregister_time_interpolator(struct time_interpolator *ti)
1497 prev = &curr->next; 1499 prev = &curr->next;
1498 } 1500 }
1499 1501
1502 clocksource_resume();
1503
1500 write_seqlock_irqsave(&xtime_lock, flags); 1504 write_seqlock_irqsave(&xtime_lock, flags);
1501 if (ti == time_interpolator) { 1505 if (ti == time_interpolator) {
1502 /* we lost the best time-interpolator: */ 1506 /* we lost the best time-interpolator: */
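
The tbase_* helpers cleaned up above hide a "deferrable" flag in bit 0 of an aligned pointer. The same tagging trick, reduced to plain user-space C purely for illustration (nothing here is kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG 0x1UL

struct base { int dummy; };

static unsigned int get_flag(struct base *p)
{
	return (unsigned int)((uintptr_t)p & FLAG);
}

static struct base *get_base(struct base *p)
{
	return (struct base *)((uintptr_t)p & ~FLAG);
}

static struct base *set_flag(struct base *p)
{
	return (struct base *)((uintptr_t)p | FLAG);
}

int main(void)
{
	static struct base b;	/* aligned, so bit 0 is free for the tag */
	struct base *tagged = set_flag(&b);

	assert(get_flag(tagged) == 1);
	assert(get_base(tagged) == &b);
	printf("flag=%u base=%p\n", get_flag(tagged), (void *)get_base(tagged));
	return 0;
}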
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6fa5e63085d..fb56fedd5c02 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -36,30 +36,20 @@
36/* 36/*
37 * The per-CPU workqueue (if single thread, we always use the first 37 * The per-CPU workqueue (if single thread, we always use the first
38 * possible cpu). 38 * possible cpu).
39 *
40 * The sequence counters are for flush_scheduled_work(). It wants to wait
41 * until all currently-scheduled works are completed, but it doesn't
42 * want to be livelocked by new, incoming ones. So it waits until
43 * remove_sequence is >= the insert_sequence which pertained when
44 * flush_scheduled_work() was called.
45 */ 39 */
46struct cpu_workqueue_struct { 40struct cpu_workqueue_struct {
47 41
48 spinlock_t lock; 42 spinlock_t lock;
49 43
50 long remove_sequence; /* Least-recently added (next to run) */
51 long insert_sequence; /* Next to add */
52
53 struct list_head worklist; 44 struct list_head worklist;
54 wait_queue_head_t more_work; 45 wait_queue_head_t more_work;
55 wait_queue_head_t work_done; 46 struct work_struct *current_work;
56 47
57 struct workqueue_struct *wq; 48 struct workqueue_struct *wq;
58 struct task_struct *thread; 49 struct task_struct *thread;
50 int should_stop;
59 51
60 int run_depth; /* Detect run_workqueue() recursion depth */ 52 int run_depth; /* Detect run_workqueue() recursion depth */
61
62 int freezeable; /* Freeze the thread during suspend */
63} ____cacheline_aligned; 53} ____cacheline_aligned;
64 54
65/* 55/*
@@ -68,8 +58,10 @@ struct cpu_workqueue_struct {
68 */ 58 */
69struct workqueue_struct { 59struct workqueue_struct {
70 struct cpu_workqueue_struct *cpu_wq; 60 struct cpu_workqueue_struct *cpu_wq;
61 struct list_head list;
71 const char *name; 62 const char *name;
72 struct list_head list; /* Empty if single thread */ 63 int singlethread;
64 int freezeable; /* Freeze threads during suspend */
73}; 65};
74 66
75/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 67/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
@@ -77,106 +69,68 @@ struct workqueue_struct {
77static DEFINE_MUTEX(workqueue_mutex); 69static DEFINE_MUTEX(workqueue_mutex);
78static LIST_HEAD(workqueues); 70static LIST_HEAD(workqueues);
79 71
80static int singlethread_cpu; 72static int singlethread_cpu __read_mostly;
73static cpumask_t cpu_singlethread_map __read_mostly;
74/* optimization, we could use cpu_possible_map */
75static cpumask_t cpu_populated_map __read_mostly;
81 76
82/* If it's single threaded, it isn't in the list of workqueues. */ 77/* If it's single threaded, it isn't in the list of workqueues. */
83static inline int is_single_threaded(struct workqueue_struct *wq) 78static inline int is_single_threaded(struct workqueue_struct *wq)
84{ 79{
85 return list_empty(&wq->list); 80 return wq->singlethread;
81}
82
83static const cpumask_t *wq_cpu_map(struct workqueue_struct *wq)
84{
85 return is_single_threaded(wq)
86 ? &cpu_singlethread_map : &cpu_populated_map;
87}
88
89static
90struct cpu_workqueue_struct *wq_per_cpu(struct workqueue_struct *wq, int cpu)
91{
92 if (unlikely(is_single_threaded(wq)))
93 cpu = singlethread_cpu;
94 return per_cpu_ptr(wq->cpu_wq, cpu);
86} 95}
87 96
88/* 97/*
89 * Set the workqueue on which a work item is to be run 98 * Set the workqueue on which a work item is to be run
90 * - Must *only* be called if the pending flag is set 99 * - Must *only* be called if the pending flag is set
91 */ 100 */
92static inline void set_wq_data(struct work_struct *work, void *wq) 101static inline void set_wq_data(struct work_struct *work,
102 struct cpu_workqueue_struct *cwq)
93{ 103{
94 unsigned long new; 104 unsigned long new;
95 105
96 BUG_ON(!work_pending(work)); 106 BUG_ON(!work_pending(work));
97 107
98 new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); 108 new = (unsigned long) cwq | (1UL << WORK_STRUCT_PENDING);
99 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); 109 new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work);
100 atomic_long_set(&work->data, new); 110 atomic_long_set(&work->data, new);
101} 111}
102 112
103static inline void *get_wq_data(struct work_struct *work) 113static inline
114struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
104{ 115{
105 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 116 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
106} 117}
107 118
108static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) 119static void insert_work(struct cpu_workqueue_struct *cwq,
120 struct work_struct *work, int tail)
109{ 121{
110 int ret = 0; 122 set_wq_data(work, cwq);
111 unsigned long flags;
112
113 spin_lock_irqsave(&cwq->lock, flags);
114 /* 123 /*
115 * We need to re-validate the work info after we've gotten 124 * Ensure that we get the right work->data if we see the
116 * the cpu_workqueue lock. We can run the work now iff: 125 * result of list_add() below, see try_to_grab_pending().
117 *
118 * - the wq_data still matches the cpu_workqueue_struct
119 * - AND the work is still marked pending
120 * - AND the work is still on a list (which will be this
121 * workqueue_struct list)
122 *
123 * All these conditions are important, because we
124 * need to protect against the work being run right
125 * now on another CPU (all but the last one might be
126 * true if it's currently running and has not been
127 * released yet, for example).
128 */ 126 */
129 if (get_wq_data(work) == cwq 127 smp_wmb();
130 && work_pending(work) 128 if (tail)
131 && !list_empty(&work->entry)) { 129 list_add_tail(&work->entry, &cwq->worklist);
132 work_func_t f = work->func; 130 else
133 list_del_init(&work->entry); 131 list_add(&work->entry, &cwq->worklist);
134 spin_unlock_irqrestore(&cwq->lock, flags); 132 wake_up(&cwq->more_work);
135
136 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work)))
137 work_release(work);
138 f(work);
139
140 spin_lock_irqsave(&cwq->lock, flags);
141 cwq->remove_sequence++;
142 wake_up(&cwq->work_done);
143 ret = 1;
144 }
145 spin_unlock_irqrestore(&cwq->lock, flags);
146 return ret;
147}
148
149/**
150 * run_scheduled_work - run scheduled work synchronously
151 * @work: work to run
152 *
153 * This checks if the work was pending, and runs it
154 * synchronously if so. It returns a boolean to indicate
155 * whether it had any scheduled work to run or not.
156 *
157 * NOTE! This _only_ works for normal work_structs. You
158 * CANNOT use this for delayed work, because the wq data
159 * for delayed work will not point properly to the per-
160 * CPU workqueue struct, but will change!
161 */
162int fastcall run_scheduled_work(struct work_struct *work)
163{
164 for (;;) {
165 struct cpu_workqueue_struct *cwq;
166
167 if (!work_pending(work))
168 return 0;
169 if (list_empty(&work->entry))
170 return 0;
171 /* NOTE! This depends intimately on __queue_work! */
172 cwq = get_wq_data(work);
173 if (!cwq)
174 return 0;
175 if (__run_work(cwq, work))
176 return 1;
177 }
178} 133}
179EXPORT_SYMBOL(run_scheduled_work);
180 134
181/* Preempt must be disabled. */ 135/* Preempt must be disabled. */
182static void __queue_work(struct cpu_workqueue_struct *cwq, 136static void __queue_work(struct cpu_workqueue_struct *cwq,
@@ -185,10 +139,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
185 unsigned long flags; 139 unsigned long flags;
186 140
187 spin_lock_irqsave(&cwq->lock, flags); 141 spin_lock_irqsave(&cwq->lock, flags);
188 set_wq_data(work, cwq); 142 insert_work(cwq, work, 1);
189 list_add_tail(&work->entry, &cwq->worklist);
190 cwq->insert_sequence++;
191 wake_up(&cwq->more_work);
192 spin_unlock_irqrestore(&cwq->lock, flags); 143 spin_unlock_irqrestore(&cwq->lock, flags);
193} 144}
194 145
@@ -204,16 +155,14 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
204 */ 155 */
205int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) 156int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
206{ 157{
207 int ret = 0, cpu = get_cpu(); 158 int ret = 0;
208 159
209 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 160 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
210 if (unlikely(is_single_threaded(wq)))
211 cpu = singlethread_cpu;
212 BUG_ON(!list_empty(&work->entry)); 161 BUG_ON(!list_empty(&work->entry));
213 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 162 __queue_work(wq_per_cpu(wq, get_cpu()), work);
163 put_cpu();
214 ret = 1; 164 ret = 1;
215 } 165 }
216 put_cpu();
217 return ret; 166 return ret;
218} 167}
219EXPORT_SYMBOL_GPL(queue_work); 168EXPORT_SYMBOL_GPL(queue_work);
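For reference, a minimal sketch of how a driver might call queue_work() on a private workqueue. All names below (example_wq, example_work_fn, ...) are illustrative and not part of this patch:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;
static struct work_struct example_work;

static void example_work_fn(struct work_struct *work)
{
	printk(KERN_INFO "example work ran\n");
}

static int __init example_init(void)
{
	example_wq = create_workqueue("example");
	if (!example_wq)
		return -ENOMEM;

	INIT_WORK(&example_work, example_work_fn);
	queue_work(example_wq, &example_work);	/* returns 0 if already pending */
	return 0;
}

static void __exit example_exit(void)
{
	flush_workqueue(example_wq);
	destroy_workqueue(example_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");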
@@ -221,13 +170,10 @@ EXPORT_SYMBOL_GPL(queue_work);
221void delayed_work_timer_fn(unsigned long __data) 170void delayed_work_timer_fn(unsigned long __data)
222{ 171{
223 struct delayed_work *dwork = (struct delayed_work *)__data; 172 struct delayed_work *dwork = (struct delayed_work *)__data;
224 struct workqueue_struct *wq = get_wq_data(&dwork->work); 173 struct cpu_workqueue_struct *cwq = get_wq_data(&dwork->work);
225 int cpu = smp_processor_id(); 174 struct workqueue_struct *wq = cwq->wq;
226 175
227 if (unlikely(is_single_threaded(wq))) 176 __queue_work(wq_per_cpu(wq, smp_processor_id()), &dwork->work);
228 cpu = singlethread_cpu;
229
230 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work);
231} 177}
232 178
233/** 179/**
@@ -241,27 +187,11 @@ void delayed_work_timer_fn(unsigned long __data)
241int fastcall queue_delayed_work(struct workqueue_struct *wq, 187int fastcall queue_delayed_work(struct workqueue_struct *wq,
242 struct delayed_work *dwork, unsigned long delay) 188 struct delayed_work *dwork, unsigned long delay)
243{ 189{
244 int ret = 0; 190 timer_stats_timer_set_start_info(&dwork->timer);
245 struct timer_list *timer = &dwork->timer;
246 struct work_struct *work = &dwork->work;
247
248 timer_stats_timer_set_start_info(timer);
249 if (delay == 0) 191 if (delay == 0)
250 return queue_work(wq, work); 192 return queue_work(wq, &dwork->work);
251
252 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
253 BUG_ON(timer_pending(timer));
254 BUG_ON(!list_empty(&work->entry));
255 193
256 /* This stores wq for the moment, for the timer_fn */ 194 return queue_delayed_work_on(-1, wq, dwork, delay);
257 set_wq_data(work, wq);
258 timer->expires = jiffies + delay;
259 timer->data = (unsigned long)dwork;
260 timer->function = delayed_work_timer_fn;
261 add_timer(timer);
262 ret = 1;
263 }
264 return ret;
265} 195}
266EXPORT_SYMBOL_GPL(queue_delayed_work); 196EXPORT_SYMBOL_GPL(queue_delayed_work);
267 197
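queue_delayed_work() now simply defers to queue_delayed_work_on(-1, ...). A sketch of a typical re-arming caller, with illustrative names (poll_wq is assumed to be created elsewhere):

#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *poll_wq;	/* assumed: created elsewhere */
static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_work, poll_fn);

static void poll_fn(struct work_struct *work)
{
	/* ... poll the hardware ... */

	/* re-arm: run again roughly one second from now */
	queue_delayed_work(poll_wq, &poll_work, HZ);
}

static void start_polling(void)
{
	queue_delayed_work(poll_wq, &poll_work, HZ);
}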
@@ -285,12 +215,16 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
285 BUG_ON(timer_pending(timer)); 215 BUG_ON(timer_pending(timer));
286 BUG_ON(!list_empty(&work->entry)); 216 BUG_ON(!list_empty(&work->entry));
287 217
288 /* This stores wq for the moment, for the timer_fn */ 218 /* This stores cwq for the moment, for the timer_fn */
289 set_wq_data(work, wq); 219 set_wq_data(work, wq_per_cpu(wq, raw_smp_processor_id()));
290 timer->expires = jiffies + delay; 220 timer->expires = jiffies + delay;
291 timer->data = (unsigned long)dwork; 221 timer->data = (unsigned long)dwork;
292 timer->function = delayed_work_timer_fn; 222 timer->function = delayed_work_timer_fn;
293 add_timer_on(timer, cpu); 223
224 if (unlikely(cpu >= 0))
225 add_timer_on(timer, cpu);
226 else
227 add_timer(timer);
294 ret = 1; 228 ret = 1;
295 } 229 }
296 return ret; 230 return ret;
@@ -299,13 +233,7 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
299 233
300static void run_workqueue(struct cpu_workqueue_struct *cwq) 234static void run_workqueue(struct cpu_workqueue_struct *cwq)
301{ 235{
302 unsigned long flags; 236 spin_lock_irq(&cwq->lock);
303
304 /*
305 * Keep taking off work from the queue until
306 * done.
307 */
308 spin_lock_irqsave(&cwq->lock, flags);
309 cwq->run_depth++; 237 cwq->run_depth++;
310 if (cwq->run_depth > 3) { 238 if (cwq->run_depth > 3) {
311 /* morton gets to eat his hat */ 239 /* morton gets to eat his hat */
@@ -318,12 +246,12 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
318 struct work_struct, entry); 246 struct work_struct, entry);
319 work_func_t f = work->func; 247 work_func_t f = work->func;
320 248
249 cwq->current_work = work;
321 list_del_init(cwq->worklist.next); 250 list_del_init(cwq->worklist.next);
322 spin_unlock_irqrestore(&cwq->lock, flags); 251 spin_unlock_irq(&cwq->lock);
323 252
324 BUG_ON(get_wq_data(work) != cwq); 253 BUG_ON(get_wq_data(work) != cwq);
325 if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) 254 work_clear_pending(work);
326 work_release(work);
327 f(work); 255 f(work);
328 256
329 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 257 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
@@ -337,63 +265,81 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
337 dump_stack(); 265 dump_stack();
338 } 266 }
339 267
340 spin_lock_irqsave(&cwq->lock, flags); 268 spin_lock_irq(&cwq->lock);
341 cwq->remove_sequence++; 269 cwq->current_work = NULL;
342 wake_up(&cwq->work_done);
343 } 270 }
344 cwq->run_depth--; 271 cwq->run_depth--;
345 spin_unlock_irqrestore(&cwq->lock, flags); 272 spin_unlock_irq(&cwq->lock);
273}
274
275/*
276 * NOTE: the caller must not touch *cwq if this func returns true
277 */
278static int cwq_should_stop(struct cpu_workqueue_struct *cwq)
279{
280 int should_stop = cwq->should_stop;
281
282 if (unlikely(should_stop)) {
283 spin_lock_irq(&cwq->lock);
284 should_stop = cwq->should_stop && list_empty(&cwq->worklist);
285 if (should_stop)
286 cwq->thread = NULL;
287 spin_unlock_irq(&cwq->lock);
288 }
289
290 return should_stop;
346} 291}
347 292
348static int worker_thread(void *__cwq) 293static int worker_thread(void *__cwq)
349{ 294{
350 struct cpu_workqueue_struct *cwq = __cwq; 295 struct cpu_workqueue_struct *cwq = __cwq;
351 DECLARE_WAITQUEUE(wait, current); 296 DEFINE_WAIT(wait);
352 struct k_sigaction sa;
353 sigset_t blocked;
354 297
355 if (!cwq->freezeable) 298 if (!cwq->wq->freezeable)
356 current->flags |= PF_NOFREEZE; 299 current->flags |= PF_NOFREEZE;
357 300
358 set_user_nice(current, -5); 301 set_user_nice(current, -5);
359 302
360 /* Block and flush all signals */ 303 for (;;) {
361 sigfillset(&blocked); 304 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
362 sigprocmask(SIG_BLOCK, &blocked, NULL); 305 if (!freezing(current) && !cwq->should_stop
363 flush_signals(current); 306 && list_empty(&cwq->worklist))
364 307 schedule();
365 /* 308 finish_wait(&cwq->more_work, &wait);
366 * We inherited MPOL_INTERLEAVE from the booting kernel.
367 * Set MPOL_DEFAULT to insure node local allocations.
368 */
369 numa_default_policy();
370
371 /* SIG_IGN makes children autoreap: see do_notify_parent(). */
372 sa.sa.sa_handler = SIG_IGN;
373 sa.sa.sa_flags = 0;
374 siginitset(&sa.sa.sa_mask, sigmask(SIGCHLD));
375 do_sigaction(SIGCHLD, &sa, (struct k_sigaction *)0);
376 309
377 set_current_state(TASK_INTERRUPTIBLE); 310 try_to_freeze();
378 while (!kthread_should_stop()) {
379 if (cwq->freezeable)
380 try_to_freeze();
381 311
382 add_wait_queue(&cwq->more_work, &wait); 312 if (cwq_should_stop(cwq))
383 if (list_empty(&cwq->worklist)) 313 break;
384 schedule();
385 else
386 __set_current_state(TASK_RUNNING);
387 remove_wait_queue(&cwq->more_work, &wait);
388 314
389 if (!list_empty(&cwq->worklist)) 315 run_workqueue(cwq);
390 run_workqueue(cwq);
391 set_current_state(TASK_INTERRUPTIBLE);
392 } 316 }
393 __set_current_state(TASK_RUNNING); 317
394 return 0; 318 return 0;
395} 319}
396 320
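The rewritten worker_thread() drops the signal setup and uses the standard prepare_to_wait()/finish_wait() sleep idiom. In miniature, with illustrative names, the idiom looks like this:

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
static int my_condition;

static void wait_for_condition(void)
{
	DEFINE_WAIT(wait);

	for (;;) {
		/* enqueue ourselves *before* testing the condition */
		prepare_to_wait(&my_waitq, &wait, TASK_INTERRUPTIBLE);
		if (my_condition)
			break;
		schedule();		/* no wakeup can be lost */
	}
	finish_wait(&my_waitq, &wait);
}

static void signal_condition(void)
{
	my_condition = 1;
	wake_up(&my_waitq);
}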
321struct wq_barrier {
322 struct work_struct work;
323 struct completion done;
324};
325
326static void wq_barrier_func(struct work_struct *work)
327{
328 struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
329 complete(&barr->done);
330}
331
332static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
333 struct wq_barrier *barr, int tail)
334{
335 INIT_WORK(&barr->work, wq_barrier_func);
336 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
337
338 init_completion(&barr->done);
339
340 insert_work(cwq, &barr->work, tail);
341}
342
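The wq_barrier introduced above is simply a work item whose callback completes a completion; the flush and cancel paths queue it and sleep on the completion. The same pattern, open-coded with illustrative names, would look roughly like this:

#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/completion.h>

struct my_barrier {
	struct work_struct work;
	struct completion  done;
};

static void my_barrier_fn(struct work_struct *work)
{
	struct my_barrier *b = container_of(work, struct my_barrier, work);

	complete(&b->done);		/* everything queued before us has run */
}

static void wait_for_queue_point(struct workqueue_struct *wq)
{
	struct my_barrier b;

	INIT_WORK(&b.work, my_barrier_fn);
	init_completion(&b.done);

	queue_work(wq, &b.work);	/* appended after already-queued work */
	wait_for_completion(&b.done);
}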
397static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 343static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
398{ 344{
399 if (cwq->thread == current) { 345 if (cwq->thread == current) {
@@ -403,21 +349,18 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
403 */ 349 */
404 run_workqueue(cwq); 350 run_workqueue(cwq);
405 } else { 351 } else {
406 DEFINE_WAIT(wait); 352 struct wq_barrier barr;
407 long sequence_needed; 353 int active = 0;
408 354
409 spin_lock_irq(&cwq->lock); 355 spin_lock_irq(&cwq->lock);
410 sequence_needed = cwq->insert_sequence; 356 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
411 357 insert_wq_barrier(cwq, &barr, 1);
412 while (sequence_needed - cwq->remove_sequence > 0) { 358 active = 1;
413 prepare_to_wait(&cwq->work_done, &wait,
414 TASK_UNINTERRUPTIBLE);
415 spin_unlock_irq(&cwq->lock);
416 schedule();
417 spin_lock_irq(&cwq->lock);
418 } 359 }
419 finish_wait(&cwq->work_done, &wait);
420 spin_unlock_irq(&cwq->lock); 360 spin_unlock_irq(&cwq->lock);
361
362 if (active)
363 wait_for_completion(&barr.done);
421 } 364 }
422} 365}
423 366
@@ -428,151 +371,145 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
428 * Forces execution of the workqueue and blocks until its completion. 371 * Forces execution of the workqueue and blocks until its completion.
429 * This is typically used in driver shutdown handlers. 372 * This is typically used in driver shutdown handlers.
430 * 373 *
431 * This function will sample each workqueue's current insert_sequence number and 374 * We sleep until all works which were queued on entry have been handled,
432 * will sleep until the head sequence is greater than or equal to that. This 375 * but we are not livelocked by new incoming ones.
433 * means that we sleep until all works which were queued on entry have been
434 * handled, but we are not livelocked by new incoming ones.
435 * 376 *
436 * This function used to run the workqueues itself. Now we just wait for the 377 * This function used to run the workqueues itself. Now we just wait for the
437 * helper threads to do it. 378 * helper threads to do it.
438 */ 379 */
439void fastcall flush_workqueue(struct workqueue_struct *wq) 380void fastcall flush_workqueue(struct workqueue_struct *wq)
440{ 381{
382 const cpumask_t *cpu_map = wq_cpu_map(wq);
383 int cpu;
384
441 might_sleep(); 385 might_sleep();
386 for_each_cpu_mask(cpu, *cpu_map)
387 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
388}
389EXPORT_SYMBOL_GPL(flush_workqueue);
442 390
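As the comment above says, flush_workqueue() waits for everything queued at the time of the call without being livelocked by new submissions. A typical shutdown-path caller, sketched with assumed names:

#include <linux/workqueue.h>

struct my_dev {
	struct workqueue_struct *wq;
	struct work_struct	 irq_work;
};

static void my_dev_shutdown(struct my_dev *dev)
{
	/*
	 * Stop new submissions first (disable the interrupt, set a
	 * "closing" flag, ...), then drain what is already queued.
	 */
	flush_workqueue(dev->wq);

	/* no work item of ours is pending or running past this point */
	destroy_workqueue(dev->wq);
}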
443 if (is_single_threaded(wq)) { 391/*
444 /* Always use first cpu's area. */ 392 * Upon a successful return, the caller "owns" WORK_STRUCT_PENDING bit,
445 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); 393 * so this work can't be re-armed in any way.
446 } else { 394 */
447 int cpu; 395static int try_to_grab_pending(struct work_struct *work)
396{
397 struct cpu_workqueue_struct *cwq;
398 int ret = 0;
448 399
449 mutex_lock(&workqueue_mutex); 400 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work)))
450 for_each_online_cpu(cpu) 401 return 1;
451 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 402
452 mutex_unlock(&workqueue_mutex); 403 /*
404 * The queueing is in progress, or it is already queued. Try to
405 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
406 */
407
408 cwq = get_wq_data(work);
409 if (!cwq)
410 return ret;
411
412 spin_lock_irq(&cwq->lock);
413 if (!list_empty(&work->entry)) {
414 /*
415 * This work is queued, but perhaps we locked the wrong cwq.
416 * In that case we must see the new value after rmb(), see
417 * insert_work()->wmb().
418 */
419 smp_rmb();
420 if (cwq == get_wq_data(work)) {
421 list_del_init(&work->entry);
422 ret = 1;
423 }
453 } 424 }
425 spin_unlock_irq(&cwq->lock);
426
427 return ret;
454} 428}
455EXPORT_SYMBOL_GPL(flush_workqueue);
456 429
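The smp_rmb() in try_to_grab_pending() pairs with the smp_wmb() in insert_work(): the pointer stored by set_wq_data() must be visible before the work shows up on the list. The ordering is the usual publish/consume pattern, sketched below with illustrative names (in this kernel the barrier macros come from <asm/system.h>):

#include <asm/system.h>		/* smp_wmb()/smp_rmb() */

static void *published_ptr;
static int   published_flag;

static void publisher(void *p)
{
	published_ptr = p;	/* 1. write the data			*/
	smp_wmb();		/* 2. order it before the flag		*/
	published_flag = 1;	/* 3. publish				*/
}

static void *consumer(void)
{
	if (!published_flag)	/* 1. observe the flag ...		*/
		return NULL;
	smp_rmb();		/* 2. ... then order the data read	*/
	return published_ptr;	/* 3. guaranteed to see the pointer	*/
}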
457static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 430static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
458 int cpu, int freezeable) 431 struct work_struct *work)
459{ 432{
460 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 433 struct wq_barrier barr;
461 struct task_struct *p; 434 int running = 0;
462 435
463 spin_lock_init(&cwq->lock); 436 spin_lock_irq(&cwq->lock);
464 cwq->wq = wq; 437 if (unlikely(cwq->current_work == work)) {
465 cwq->thread = NULL; 438 insert_wq_barrier(cwq, &barr, 0);
466 cwq->insert_sequence = 0; 439 running = 1;
467 cwq->remove_sequence = 0; 440 }
468 cwq->freezeable = freezeable; 441 spin_unlock_irq(&cwq->lock);
469 INIT_LIST_HEAD(&cwq->worklist);
470 init_waitqueue_head(&cwq->more_work);
471 init_waitqueue_head(&cwq->work_done);
472 442
473 if (is_single_threaded(wq)) 443 if (unlikely(running))
474 p = kthread_create(worker_thread, cwq, "%s", wq->name); 444 wait_for_completion(&barr.done);
475 else
476 p = kthread_create(worker_thread, cwq, "%s/%d", wq->name, cpu);
477 if (IS_ERR(p))
478 return NULL;
479 cwq->thread = p;
480 return p;
481} 445}
482 446
483struct workqueue_struct *__create_workqueue(const char *name, 447static void wait_on_work(struct work_struct *work)
484 int singlethread, int freezeable)
485{ 448{
486 int cpu, destroy = 0; 449 struct cpu_workqueue_struct *cwq;
487 struct workqueue_struct *wq; 450 struct workqueue_struct *wq;
488 struct task_struct *p; 451 const cpumask_t *cpu_map;
452 int cpu;
489 453
490 wq = kzalloc(sizeof(*wq), GFP_KERNEL); 454 might_sleep();
491 if (!wq)
492 return NULL;
493 455
494 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); 456 cwq = get_wq_data(work);
495 if (!wq->cpu_wq) { 457 if (!cwq)
496 kfree(wq); 458 return;
497 return NULL;
498 }
499 459
500 wq->name = name; 460 wq = cwq->wq;
501 mutex_lock(&workqueue_mutex); 461 cpu_map = wq_cpu_map(wq);
502 if (singlethread) {
503 INIT_LIST_HEAD(&wq->list);
504 p = create_workqueue_thread(wq, singlethread_cpu, freezeable);
505 if (!p)
506 destroy = 1;
507 else
508 wake_up_process(p);
509 } else {
510 list_add(&wq->list, &workqueues);
511 for_each_online_cpu(cpu) {
512 p = create_workqueue_thread(wq, cpu, freezeable);
513 if (p) {
514 kthread_bind(p, cpu);
515 wake_up_process(p);
516 } else
517 destroy = 1;
518 }
519 }
520 mutex_unlock(&workqueue_mutex);
521 462
522 /* 463 for_each_cpu_mask(cpu, *cpu_map)
523 * Was there any error during startup? If yes then clean up: 464 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
524 */
525 if (destroy) {
526 destroy_workqueue(wq);
527 wq = NULL;
528 }
529 return wq;
530} 465}
531EXPORT_SYMBOL_GPL(__create_workqueue);
532 466
533static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) 467/**
468 * cancel_work_sync - block until a work_struct's callback has terminated
469 * @work: the work to cancel
470 *
471 * cancel_work_sync() will cancel the work if it is queued. If the work's
472 * callback appears to be running, cancel_work_sync() will block until it
473 * has completed.
474 *
475 * It is possible to use this function if the work re-queues itself. It can
476 * cancel the work even if it migrates to another workqueue; however, in that
477 * case it only guarantees that work->func() has completed on the last queued
478 * workqueue.
479 *
480 * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
481 * pending; otherwise it goes into a busy-wait loop until the timer expires.
482 *
483 * The caller must ensure that the workqueue_struct on which this work was last
484 * queued can't be destroyed before this function returns.
485 */
486void cancel_work_sync(struct work_struct *work)
534{ 487{
535 struct cpu_workqueue_struct *cwq; 488 while (!try_to_grab_pending(work))
536 unsigned long flags; 489 cpu_relax();
537 struct task_struct *p; 490 wait_on_work(work);
538 491 work_clear_pending(work);
539 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
540 spin_lock_irqsave(&cwq->lock, flags);
541 p = cwq->thread;
542 cwq->thread = NULL;
543 spin_unlock_irqrestore(&cwq->lock, flags);
544 if (p)
545 kthread_stop(p);
546} 492}
493EXPORT_SYMBOL_GPL(cancel_work_sync);
547 494
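A sketch of the intended caller of the new cancel_work_sync(), using assumed names:

#include <linux/workqueue.h>

struct my_port {
	struct work_struct rx_work;
};

static void my_port_close(struct my_port *port)
{
	/*
	 * After this returns, rx_work is neither pending nor running,
	 * even if its handler had re-queued itself in the meantime.
	 */
	cancel_work_sync(&port->rx_work);
}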
548/** 495/**
549 * destroy_workqueue - safely terminate a workqueue 496 * cancel_rearming_delayed_work - reliably kill off a delayed work.
550 * @wq: target workqueue 497 * @dwork: the delayed work struct
551 * 498 *
552 * Safely destroy a workqueue. All work currently pending will be done first. 499 * It is possible to use this function if @dwork rearms itself via queue_work()
500 * or queue_delayed_work(). See also the comment for cancel_work_sync().
553 */ 501 */
554void destroy_workqueue(struct workqueue_struct *wq) 502void cancel_rearming_delayed_work(struct delayed_work *dwork)
555{ 503{
556 int cpu; 504 while (!del_timer(&dwork->timer) &&
557 505 !try_to_grab_pending(&dwork->work))
558 flush_workqueue(wq); 506 cpu_relax();
559 507 wait_on_work(&dwork->work);
560 /* We don't need the distraction of CPUs appearing and vanishing. */ 508 work_clear_pending(&dwork->work);
561 mutex_lock(&workqueue_mutex);
562 if (is_single_threaded(wq))
563 cleanup_workqueue_thread(wq, singlethread_cpu);
564 else {
565 for_each_online_cpu(cpu)
566 cleanup_workqueue_thread(wq, cpu);
567 list_del(&wq->list);
568 }
569 mutex_unlock(&workqueue_mutex);
570 free_percpu(wq->cpu_wq);
571 kfree(wq);
572} 509}
573EXPORT_SYMBOL_GPL(destroy_workqueue); 510EXPORT_SYMBOL(cancel_rearming_delayed_work);
574 511
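And the matching sketch for the rewritten cancel_rearming_delayed_work(), again with assumed names:

#include <linux/workqueue.h>

static struct delayed_work poll_dwork;	/* assumed: initialized elsewhere */

static void stop_polling(void)
{
	/*
	 * Kills the timer and the work and waits for a running callback,
	 * even though the handler keeps re-arming poll_dwork.
	 */
	cancel_rearming_delayed_work(&poll_dwork);
}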
575static struct workqueue_struct *keventd_wq; 512static struct workqueue_struct *keventd_wq __read_mostly;
576 513
577/** 514/**
578 * schedule_work - put work task in global workqueue 515 * schedule_work - put work task in global workqueue
@@ -638,7 +575,7 @@ int schedule_on_each_cpu(work_func_t func)
638 if (!works) 575 if (!works)
639 return -ENOMEM; 576 return -ENOMEM;
640 577
641 mutex_lock(&workqueue_mutex); 578 preempt_disable(); /* CPU hotplug */
642 for_each_online_cpu(cpu) { 579 for_each_online_cpu(cpu) {
643 struct work_struct *work = per_cpu_ptr(works, cpu); 580 struct work_struct *work = per_cpu_ptr(works, cpu);
644 581
@@ -646,7 +583,7 @@ int schedule_on_each_cpu(work_func_t func)
646 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 583 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
647 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); 584 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
648 } 585 }
649 mutex_unlock(&workqueue_mutex); 586 preempt_enable();
650 flush_workqueue(keventd_wq); 587 flush_workqueue(keventd_wq);
651 free_percpu(works); 588 free_percpu(works);
652 return 0; 589 return 0;
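schedule_on_each_cpu() now relies on preempt_disable() rather than workqueue_mutex to keep the online map stable across the loop. A sketch of a caller (names assumed):

#include <linux/workqueue.h>

static void flush_local_caches(struct work_struct *unused)
{
	/* runs once on every online CPU, in keventd context */
}

static int flush_all_caches(void)
{
	return schedule_on_each_cpu(flush_local_caches);
}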
@@ -659,29 +596,6 @@ void flush_scheduled_work(void)
659EXPORT_SYMBOL(flush_scheduled_work); 596EXPORT_SYMBOL(flush_scheduled_work);
660 597
661/** 598/**
662 * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work.
663 * @wq: the controlling workqueue structure
664 * @dwork: the delayed work struct
665 */
666void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
667 struct delayed_work *dwork)
668{
669 while (!cancel_delayed_work(dwork))
670 flush_workqueue(wq);
671}
672EXPORT_SYMBOL(cancel_rearming_delayed_workqueue);
673
674/**
675 * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work.
676 * @dwork: the delayed work struct
677 */
678void cancel_rearming_delayed_work(struct delayed_work *dwork)
679{
680 cancel_rearming_delayed_workqueue(keventd_wq, dwork);
681}
682EXPORT_SYMBOL(cancel_rearming_delayed_work);
683
684/**
685 * execute_in_process_context - reliably execute the routine with user context 599 * execute_in_process_context - reliably execute the routine with user context
686 * @fn: the function to execute 600 * @fn: the function to execute
687 * @ew: guaranteed storage for the execute work structure (must 601 * @ew: guaranteed storage for the execute work structure (must
@@ -728,94 +642,209 @@ int current_is_keventd(void)
728 642
729} 643}
730 644
731/* Take the work from this (downed) CPU. */ 645static struct cpu_workqueue_struct *
732static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 646init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
733{ 647{
734 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 648 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
735 struct list_head list;
736 struct work_struct *work;
737 649
738 spin_lock_irq(&cwq->lock); 650 cwq->wq = wq;
739 list_replace_init(&cwq->worklist, &list); 651 spin_lock_init(&cwq->lock);
652 INIT_LIST_HEAD(&cwq->worklist);
653 init_waitqueue_head(&cwq->more_work);
654
655 return cwq;
656}
657
658static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
659{
660 struct workqueue_struct *wq = cwq->wq;
661 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
662 struct task_struct *p;
663
664 p = kthread_create(worker_thread, cwq, fmt, wq->name, cpu);
665 /*
666 * Nobody can add the work_struct to this cwq,
667 * if (caller is __create_workqueue)
668 * nobody should see this wq
669 * else // caller is CPU_UP_PREPARE
670 * cpu is not on cpu_online_map
671 * so we can abort safely.
672 */
673 if (IS_ERR(p))
674 return PTR_ERR(p);
675
676 cwq->thread = p;
677 cwq->should_stop = 0;
678
679 return 0;
680}
681
682static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
683{
684 struct task_struct *p = cwq->thread;
740 685
741 while (!list_empty(&list)) { 686 if (p != NULL) {
742 printk("Taking work for %s\n", wq->name); 687 if (cpu >= 0)
743 work = list_entry(list.next,struct work_struct,entry); 688 kthread_bind(p, cpu);
744 list_del(&work->entry); 689 wake_up_process(p);
745 __queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
746 } 690 }
747 spin_unlock_irq(&cwq->lock);
748} 691}
749 692
750/* We're holding the cpucontrol mutex here */ 693struct workqueue_struct *__create_workqueue(const char *name,
751static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, 694 int singlethread, int freezeable)
752 unsigned long action,
753 void *hcpu)
754{ 695{
755 unsigned int hotcpu = (unsigned long)hcpu;
756 struct workqueue_struct *wq; 696 struct workqueue_struct *wq;
697 struct cpu_workqueue_struct *cwq;
698 int err = 0, cpu;
757 699
758 switch (action) { 700 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
759 case CPU_UP_PREPARE: 701 if (!wq)
760 mutex_lock(&workqueue_mutex); 702 return NULL;
761 /* Create a new workqueue thread for it. */
762 list_for_each_entry(wq, &workqueues, list) {
763 if (!create_workqueue_thread(wq, hotcpu, 0)) {
764 printk("workqueue for %i failed\n", hotcpu);
765 return NOTIFY_BAD;
766 }
767 }
768 break;
769 703
770 case CPU_ONLINE: 704 wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
771 /* Kick off worker threads. */ 705 if (!wq->cpu_wq) {
772 list_for_each_entry(wq, &workqueues, list) { 706 kfree(wq);
773 struct cpu_workqueue_struct *cwq; 707 return NULL;
708 }
774 709
775 cwq = per_cpu_ptr(wq->cpu_wq, hotcpu); 710 wq->name = name;
776 kthread_bind(cwq->thread, hotcpu); 711 wq->singlethread = singlethread;
777 wake_up_process(cwq->thread); 712 wq->freezeable = freezeable;
778 } 713 INIT_LIST_HEAD(&wq->list);
779 mutex_unlock(&workqueue_mutex);
780 break;
781 714
782 case CPU_UP_CANCELED: 715 if (singlethread) {
783 list_for_each_entry(wq, &workqueues, list) { 716 cwq = init_cpu_workqueue(wq, singlethread_cpu);
784 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) 717 err = create_workqueue_thread(cwq, singlethread_cpu);
718 start_workqueue_thread(cwq, -1);
719 } else {
720 mutex_lock(&workqueue_mutex);
721 list_add(&wq->list, &workqueues);
722
723 for_each_possible_cpu(cpu) {
724 cwq = init_cpu_workqueue(wq, cpu);
725 if (err || !cpu_online(cpu))
785 continue; 726 continue;
786 /* Unbind so it can run. */ 727 err = create_workqueue_thread(cwq, cpu);
787 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 728 start_workqueue_thread(cwq, cpu);
788 any_online_cpu(cpu_online_map));
789 cleanup_workqueue_thread(wq, hotcpu);
790 } 729 }
791 mutex_unlock(&workqueue_mutex); 730 mutex_unlock(&workqueue_mutex);
792 break; 731 }
732
733 if (err) {
734 destroy_workqueue(wq);
735 wq = NULL;
736 }
737 return wq;
738}
739EXPORT_SYMBOL_GPL(__create_workqueue);
740
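__create_workqueue() is normally reached through the wrapper macros in workqueue.h, which pick between a per-CPU and a single-threaded queue. A minimal sketch of a caller (names assumed):

#include <linux/workqueue.h>

static struct workqueue_struct *ctl_wq;

static int setup_control_queue(void)
{
	/* one worker thread total, instead of one per CPU */
	ctl_wq = create_singlethread_workqueue("my_ctl");
	if (!ctl_wq)
		return -ENOMEM;
	return 0;
}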
741static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
742{
743 struct wq_barrier barr;
744 int alive = 0;
745
746 spin_lock_irq(&cwq->lock);
747 if (cwq->thread != NULL) {
748 insert_wq_barrier(cwq, &barr, 1);
749 cwq->should_stop = 1;
750 alive = 1;
751 }
752 spin_unlock_irq(&cwq->lock);
753
754 if (alive) {
755 wait_for_completion(&barr.done);
793 756
794 case CPU_DOWN_PREPARE: 757 while (unlikely(cwq->thread != NULL))
758 cpu_relax();
759 /*
760 * Wait until cwq->thread unlocks cwq->lock,
761 * it won't touch *cwq after that.
762 */
763 smp_rmb();
764 spin_unlock_wait(&cwq->lock);
765 }
766}
767
768/**
769 * destroy_workqueue - safely terminate a workqueue
770 * @wq: target workqueue
771 *
772 * Safely destroy a workqueue. All work currently pending will be done first.
773 */
774void destroy_workqueue(struct workqueue_struct *wq)
775{
776 const cpumask_t *cpu_map = wq_cpu_map(wq);
777 struct cpu_workqueue_struct *cwq;
778 int cpu;
779
780 mutex_lock(&workqueue_mutex);
781 list_del(&wq->list);
782 mutex_unlock(&workqueue_mutex);
783
784 for_each_cpu_mask(cpu, *cpu_map) {
785 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
786 cleanup_workqueue_thread(cwq, cpu);
787 }
788
789 free_percpu(wq->cpu_wq);
790 kfree(wq);
791}
792EXPORT_SYMBOL_GPL(destroy_workqueue);
793
794static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
795 unsigned long action,
796 void *hcpu)
797{
798 unsigned int cpu = (unsigned long)hcpu;
799 struct cpu_workqueue_struct *cwq;
800 struct workqueue_struct *wq;
801
802 action &= ~CPU_TASKS_FROZEN;
803
804 switch (action) {
805 case CPU_LOCK_ACQUIRE:
795 mutex_lock(&workqueue_mutex); 806 mutex_lock(&workqueue_mutex);
796 break; 807 return NOTIFY_OK;
797 808
798 case CPU_DOWN_FAILED: 809 case CPU_LOCK_RELEASE:
799 mutex_unlock(&workqueue_mutex); 810 mutex_unlock(&workqueue_mutex);
800 break; 811 return NOTIFY_OK;
801 812
802 case CPU_DEAD: 813 case CPU_UP_PREPARE:
803 list_for_each_entry(wq, &workqueues, list) 814 cpu_set(cpu, cpu_populated_map);
804 cleanup_workqueue_thread(wq, hotcpu); 815 }
805 list_for_each_entry(wq, &workqueues, list) 816
806 take_over_work(wq, hotcpu); 817 list_for_each_entry(wq, &workqueues, list) {
807 mutex_unlock(&workqueue_mutex); 818 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
808 break; 819
820 switch (action) {
821 case CPU_UP_PREPARE:
822 if (!create_workqueue_thread(cwq, cpu))
823 break;
824 printk(KERN_ERR "workqueue for %i failed\n", cpu);
825 return NOTIFY_BAD;
826
827 case CPU_ONLINE:
828 start_workqueue_thread(cwq, cpu);
829 break;
830
831 case CPU_UP_CANCELED:
832 start_workqueue_thread(cwq, -1);
833 case CPU_DEAD:
834 cleanup_workqueue_thread(cwq, cpu);
835 break;
836 }
809 } 837 }
810 838
811 return NOTIFY_OK; 839 return NOTIFY_OK;
812} 840}
813 841
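The callback above follows the common CPU-hotplug notifier shape; a subsystem registering its own notifier would look roughly like this (illustrative, not part of the patch):

#include <linux/kernel.h>
#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
				     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		printk(KERN_INFO "cpu %u came online\n", cpu);
		/* set up per-cpu state here */
		break;
	case CPU_DEAD:
		/* and tear it down again */
		break;
	}
	return NOTIFY_OK;
}

static int __init my_hotplug_init(void)
{
	hotcpu_notifier(my_cpu_callback, 0);
	return 0;
}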
814void init_workqueues(void) 842void __init init_workqueues(void)
815{ 843{
844 cpu_populated_map = cpu_online_map;
816 singlethread_cpu = first_cpu(cpu_possible_map); 845 singlethread_cpu = first_cpu(cpu_possible_map);
846 cpu_singlethread_map = cpumask_of_cpu(singlethread_cpu);
817 hotcpu_notifier(workqueue_cpu_callback, 0); 847 hotcpu_notifier(workqueue_cpu_callback, 0);
818 keventd_wq = create_workqueue("events"); 848 keventd_wq = create_workqueue("events");
819 BUG_ON(!keventd_wq); 849 BUG_ON(!keventd_wq);
820} 850}
821