author     Linus Torvalds <torvalds@ppc970.osdl.org>   2005-04-16 18:20:36 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>   2005-04-16 18:20:36 -0400
commit     1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree       0bba044c4ce775e45a88a51686b5d9f90697ea9d /kernel/futex.c
Linux-2.6.12-rc2
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--   kernel/futex.c   798
1 file changed, 798 insertions, 0 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
new file mode 100644
index 00000000000..7b54a672d0a
--- /dev/null
+++ b/kernel/futex.c
@@ -0,0 +1,798 @@
/*
 * Fast Userspace Mutexes (which I call "Futexes!").
 * (C) Rusty Russell, IBM 2002
 *
 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 * Removed page pinning, fix privately mapped COW pages and other cleanups
 * (C) Copyright 2003, 2004 Jamie Lokier
 *
 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 * enough at me, Linus for the original (flawed) idea, Matthew
 * Kirkwood for proof-of-concept implementation.
 *
 * "The futexes are also cursed."
 * "But they come in a choice of three flavours!"
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/futex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>

#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)

/*
 * Futexes are matched on equal values of this key.
 * The key type depends on whether it's a shared or private mapping.
 * Don't rearrange members without looking at hash_futex().
 *
 * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
 * We set bit 0 to indicate if it's an inode-based key.
 */
union futex_key {
	struct {
		unsigned long pgoff;
		struct inode *inode;
		int offset;
	} shared;
	struct {
		unsigned long uaddr;
		struct mm_struct *mm;
		int offset;
	} private;
	struct {
		unsigned long word;
		void *ptr;
		int offset;
	} both;
};

/*
 * We use this hashed waitqueue instead of a normal wait_queue_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * wake up q->waiters, then make the second condition true.
 */
struct futex_q {
	struct list_head list;
	wait_queue_head_t waiters;

	/* Which hash list lock to use. */
	spinlock_t *lock_ptr;

	/* Key which the futex is hashed on. */
	union futex_key key;

	/* For fd, sigio sent using these. */
	int fd;
	struct file *filp;
};

/*
 * Split the global futex_lock into every hash list lock.
 */
struct futex_hash_bucket {
	spinlock_t lock;
	struct list_head chain;
};

static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];

/* Futex-fs vfsmount entry: */
static struct vfsmount *futex_mnt;

/*
 * We hash on the keys returned from get_futex_key (see below).
 */
static struct futex_hash_bucket *hash_futex(union futex_key *key)
{
	u32 hash = jhash2((u32*)&key->both.word,
			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
			  key->both.offset);
	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
}

/*
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int match_futex(union futex_key *key1, union futex_key *key2)
{
	return (key1->both.word == key2->both.word
		&& key1->both.ptr == key2->both.ptr
		&& key1->both.offset == key2->both.offset);
}

/*
 * Get parameters which are the keys for a futex.
 *
 * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
 * offset_within_page).  For private mappings, it's (uaddr, current->mm).
 * We can usually work out the index without swapping in the page.
 *
 * Returns: 0, or negative error code.
 * The key words are stored in *key on success.
 *
 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
 */
static int get_futex_key(unsigned long uaddr, union futex_key *key)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct page *page;
	int err;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = uaddr % PAGE_SIZE;
	if (unlikely((key->both.offset % sizeof(u32)) != 0))
		return -EINVAL;
	uaddr -= key->both.offset;

	/*
	 * The futex is hashed differently depending on whether
	 * it's in a shared or private mapping.  So check vma first.
	 */
	vma = find_extend_vma(mm, uaddr);
	if (unlikely(!vma))
		return -EFAULT;

	/*
	 * Permissions.
	 */
	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;

	/*
	 * Private mappings are handled in a simple way.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.  Therefore we use
	 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
	 * mappings of _writable_ handles.
	 */
	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
		key->private.mm = mm;
		key->private.uaddr = uaddr;
		return 0;
	}

	/*
	 * Linear file mappings are also simple.
	 */
	key->shared.inode = vma->vm_file->f_dentry->d_inode;
	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
		key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
				     + vma->vm_pgoff);
		return 0;
	}

	/*
	 * We could walk the page table to read the non-linear
	 * pte, and get the page index without fetching the page
	 * from swap.  But that's a lot of code to duplicate here
	 * for a rare case, so we simply fetch the page.
	 */

	/*
	 * Do a quick atomic lookup first - this is the fastpath.
	 */
	spin_lock(&current->mm->page_table_lock);
	page = follow_page(mm, uaddr, 0);
	if (likely(page != NULL)) {
		key->shared.pgoff =
			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		spin_unlock(&current->mm->page_table_lock);
		return 0;
	}
	spin_unlock(&current->mm->page_table_lock);

	/*
	 * Do it the general way.
	 */
	err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
	if (err >= 0) {
		key->shared.pgoff =
			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
		put_page(page);
		return 0;
	}
	return err;
}
228 | |||
229 | /* | ||
230 | * Take a reference to the resource addressed by a key. | ||
231 | * Can be called while holding spinlocks. | ||
232 | * | ||
233 | * NOTE: mmap_sem MUST be held between get_futex_key() and calling this | ||
234 | * function, if it is called at all. mmap_sem keeps key->shared.inode valid. | ||
235 | */ | ||
236 | static inline void get_key_refs(union futex_key *key) | ||
237 | { | ||
238 | if (key->both.ptr != 0) { | ||
239 | if (key->both.offset & 1) | ||
240 | atomic_inc(&key->shared.inode->i_count); | ||
241 | else | ||
242 | atomic_inc(&key->private.mm->mm_count); | ||
243 | } | ||
244 | } | ||
245 | |||
246 | /* | ||
247 | * Drop a reference to the resource addressed by a key. | ||
248 | * The hash bucket spinlock must not be held. | ||
249 | */ | ||
250 | static void drop_key_refs(union futex_key *key) | ||
251 | { | ||
252 | if (key->both.ptr != 0) { | ||
253 | if (key->both.offset & 1) | ||
254 | iput(key->shared.inode); | ||
255 | else | ||
256 | mmdrop(key->private.mm); | ||
257 | } | ||
258 | } | ||
259 | |||
static inline int get_futex_value_locked(int *dest, int __user *from)
{
	int ret;

	inc_preempt_count();
	ret = __copy_from_user_inatomic(dest, from, sizeof(int));
	dec_preempt_count();

	return ret ? -EFAULT : 0;
}

/*
 * The hash bucket lock must be held when this is called.
 * Afterwards, the futex_q must not be accessed.
 */
static void wake_futex(struct futex_q *q)
{
	list_del_init(&q->list);
	if (q->filp)
		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
	/*
	 * The lock in wake_up_all() is a crucial memory barrier after the
	 * list_del_init() and also before assigning to q->lock_ptr.
	 */
	wake_up_all(&q->waiters);
	/*
	 * The waiting task can free the futex_q as soon as this is written,
	 * without taking any locks.  This must come last.
	 */
	q->lock_ptr = NULL;
}

/*
 * Wake up all waiters hashed on the physical page that is mapped
 * to this virtual address:
 */
static int futex_wake(unsigned long uaddr, int nr_wake)
{
	union futex_key key;
	struct futex_hash_bucket *bh;
	struct list_head *head;
	struct futex_q *this, *next;
	int ret;

	down_read(&current->mm->mmap_sem);

	ret = get_futex_key(uaddr, &key);
	if (unlikely(ret != 0))
		goto out;

	bh = hash_futex(&key);
	spin_lock(&bh->lock);
	head = &bh->chain;

	list_for_each_entry_safe(this, next, head, list) {
		if (match_futex (&this->key, &key)) {
			wake_futex(this);
			if (++ret >= nr_wake)
				break;
		}
	}

	spin_unlock(&bh->lock);
out:
	up_read(&current->mm->mmap_sem);
	return ret;
}
327 | |||
328 | /* | ||
329 | * Requeue all waiters hashed on one physical page to another | ||
330 | * physical page. | ||
331 | */ | ||
332 | static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | ||
333 | int nr_wake, int nr_requeue, int *valp) | ||
334 | { | ||
335 | union futex_key key1, key2; | ||
336 | struct futex_hash_bucket *bh1, *bh2; | ||
337 | struct list_head *head1; | ||
338 | struct futex_q *this, *next; | ||
339 | int ret, drop_count = 0; | ||
340 | |||
341 | retry: | ||
342 | down_read(¤t->mm->mmap_sem); | ||
343 | |||
344 | ret = get_futex_key(uaddr1, &key1); | ||
345 | if (unlikely(ret != 0)) | ||
346 | goto out; | ||
347 | ret = get_futex_key(uaddr2, &key2); | ||
348 | if (unlikely(ret != 0)) | ||
349 | goto out; | ||
350 | |||
351 | bh1 = hash_futex(&key1); | ||
352 | bh2 = hash_futex(&key2); | ||
353 | |||
	if (bh1 < bh2)
		spin_lock(&bh1->lock);
	spin_lock(&bh2->lock);
	if (bh1 > bh2)
		spin_lock(&bh1->lock);

	if (likely(valp != NULL)) {
		int curval;

		ret = get_futex_value_locked(&curval, (int __user *)uaddr1);

		if (unlikely(ret)) {
			spin_unlock(&bh1->lock);
			if (bh1 != bh2)
				spin_unlock(&bh2->lock);

			/* If we would have faulted, release mmap_sem, fault
			 * it in and start all over again.
			 */
			up_read(&current->mm->mmap_sem);

			ret = get_user(curval, (int __user *)uaddr1);

			if (!ret)
				goto retry;

			return ret;
		}
		if (curval != *valp) {
			ret = -EAGAIN;
			goto out_unlock;
		}
	}

	head1 = &bh1->chain;
	list_for_each_entry_safe(this, next, head1, list) {
		if (!match_futex (&this->key, &key1))
			continue;
		if (++ret <= nr_wake) {
			wake_futex(this);
		} else {
			list_move_tail(&this->list, &bh2->chain);
			this->lock_ptr = &bh2->lock;
			this->key = key2;
			get_key_refs(&key2);
			drop_count++;

			if (ret - nr_wake >= nr_requeue)
				break;
			/* Make sure to stop if key1 == key2 */
			if (head1 == &bh2->chain && head1 != &next->list)
				head1 = &this->list;
		}
	}

out_unlock:
	spin_unlock(&bh1->lock);
	if (bh1 != bh2)
		spin_unlock(&bh2->lock);

	/* drop_key_refs() must be called outside the spinlocks. */
	while (--drop_count >= 0)
		drop_key_refs(&key1);

out:
	up_read(&current->mm->mmap_sem);
	return ret;
}
422 | |||
423 | /* The key must be already stored in q->key. */ | ||
424 | static inline struct futex_hash_bucket * | ||
425 | queue_lock(struct futex_q *q, int fd, struct file *filp) | ||
426 | { | ||
427 | struct futex_hash_bucket *bh; | ||
428 | |||
429 | q->fd = fd; | ||
430 | q->filp = filp; | ||
431 | |||
432 | init_waitqueue_head(&q->waiters); | ||
433 | |||
434 | get_key_refs(&q->key); | ||
435 | bh = hash_futex(&q->key); | ||
436 | q->lock_ptr = &bh->lock; | ||
437 | |||
438 | spin_lock(&bh->lock); | ||
439 | return bh; | ||
440 | } | ||
441 | |||
static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh)
{
	list_add_tail(&q->list, &bh->chain);
	spin_unlock(&bh->lock);
}

static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
{
	spin_unlock(&bh->lock);
	drop_key_refs(&q->key);
}

/*
 * queue_me and unqueue_me must be called as a pair, each
 * exactly once.  They are called with the hashed spinlock held.
 */

/* The key must be already stored in q->key. */
static void queue_me(struct futex_q *q, int fd, struct file *filp)
{
	struct futex_hash_bucket *bh;
	bh = queue_lock(q, fd, filp);
	__queue_me(q, bh);
}

/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
	int ret = 0;
	spinlock_t *lock_ptr;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	lock_ptr = q->lock_ptr;
	if (lock_ptr != 0) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock.  This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock().  It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock().  It cannot,
		 * however, change back to the original value.  Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		WARN_ON(list_empty(&q->list));
		list_del(&q->list);
		spin_unlock(lock_ptr);
		ret = 1;
	}

	drop_key_refs(&q->key);
	return ret;
}

static int futex_wait(unsigned long uaddr, int val, unsigned long time)
{
	DECLARE_WAITQUEUE(wait, current);
	int ret, curval;
	struct futex_q q;
	struct futex_hash_bucket *bh;

retry:
	down_read(&current->mm->mmap_sem);

	ret = get_futex_key(uaddr, &q.key);
	if (unlikely(ret != 0))
		goto out_release_sem;

	bh = queue_lock(&q, -1, NULL);

	/*
	 * Access the page AFTER the futex is queued.
	 * Order is important:
	 *
	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
	 *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
	 *
	 * The basic logical guarantee of a futex is that it blocks ONLY
	 * if cond(var) is known to be true at the time of blocking, for
	 * any cond.  If we queued after testing *uaddr, that would open
	 * a race condition where we could block indefinitely with
	 * cond(var) false, which would violate the guarantee.
	 *
	 * A consequence is that futex_wait() can return zero and absorb
	 * a wakeup when *uaddr != val on entry to the syscall.  This is
	 * rare, but normal.
	 *
	 * We hold the mmap semaphore, so the mapping cannot have changed
	 * since we looked it up in get_futex_key.
	 */

	ret = get_futex_value_locked(&curval, (int __user *)uaddr);

	if (unlikely(ret)) {
		queue_unlock(&q, bh);

		/* If we would have faulted, release mmap_sem, fault it in and
		 * start all over again.
		 */
		up_read(&current->mm->mmap_sem);

		ret = get_user(curval, (int __user *)uaddr);

		if (!ret)
			goto retry;
		return ret;
	}
	if (curval != val) {
		ret = -EWOULDBLOCK;
		queue_unlock(&q, bh);
		goto out_release_sem;
	}

	/* Only actually queue if *uaddr contained val. */
	__queue_me(&q, bh);

	/*
	 * Now the futex is queued and we have checked the data, we
	 * don't want to hold mmap_sem while we sleep.
	 */
	up_read(&current->mm->mmap_sem);

	/*
	 * There might have been scheduling since the queue_me(), as we
	 * cannot hold a spinlock across the get_user() in case it
	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
	 * queueing ourselves into the futex hash.  This code thus has to
	 * rely on the futex_wake() code removing us from hash when it
	 * wakes us up.
	 */

	/* add_wait_queue is the barrier after __set_current_state. */
	__set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue(&q.waiters, &wait);
	/*
	 * !list_empty() is safe here without any lock.
	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
	 */
	if (likely(!list_empty(&q.list)))
		time = schedule_timeout(time);
	__set_current_state(TASK_RUNNING);

	/*
	 * NOTE: we don't remove ourselves from the waitqueue because
	 * we are the only user of it.
	 */

	/* If we were woken (and unqueued), we succeeded, whatever. */
	if (!unqueue_me(&q))
		return 0;
	if (time == 0)
		return -ETIMEDOUT;
	/* We expect signal_pending(current), but another thread may
	 * have handled it for us already. */
	return -EINTR;

out_release_sem:
	up_read(&current->mm->mmap_sem);
	return ret;
}

static int futex_close(struct inode *inode, struct file *filp)
{
	struct futex_q *q = filp->private_data;

	unqueue_me(q);
	kfree(q);
	return 0;
}

/* This is one-shot: once it's gone off you need a new fd */
static unsigned int futex_poll(struct file *filp,
			       struct poll_table_struct *wait)
{
	struct futex_q *q = filp->private_data;
	int ret = 0;

	poll_wait(filp, &q->waiters, wait);

	/*
	 * list_empty() is safe here without any lock.
	 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
	 */
	if (list_empty(&q->list))
		ret = POLLIN | POLLRDNORM;

	return ret;
}

static struct file_operations futex_fops = {
	.release	= futex_close,
	.poll		= futex_poll,
};

/*
 * Signal allows caller to avoid the race which would occur if they
 * set the sigio stuff up afterwards.
 */
static int futex_fd(unsigned long uaddr, int signal)
{
	struct futex_q *q;
	struct file *filp;
	int ret, err;

	ret = -EINVAL;
	if (signal < 0 || signal > _NSIG)
		goto out;

	ret = get_unused_fd();
	if (ret < 0)
		goto out;
	filp = get_empty_filp();
	if (!filp) {
		put_unused_fd(ret);
		ret = -ENFILE;
		goto out;
	}
	filp->f_op = &futex_fops;
	filp->f_vfsmnt = mntget(futex_mnt);
	filp->f_dentry = dget(futex_mnt->mnt_root);
	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;

	if (signal) {
		int err;
		err = f_setown(filp, current->pid, 1);
		if (err < 0) {
			put_unused_fd(ret);
			put_filp(filp);
			ret = err;
			goto out;
		}
		filp->f_owner.signum = signal;
	}

	q = kmalloc(sizeof(*q), GFP_KERNEL);
	if (!q) {
		put_unused_fd(ret);
		put_filp(filp);
		ret = -ENOMEM;
		goto out;
	}

	down_read(&current->mm->mmap_sem);
	err = get_futex_key(uaddr, &q->key);

	if (unlikely(err != 0)) {
		up_read(&current->mm->mmap_sem);
		put_unused_fd(ret);
		put_filp(filp);
		kfree(q);
		return err;
	}

	/*
	 * queue_me() must be called before releasing mmap_sem, because
	 * key->shared.inode needs to be referenced while holding it.
	 */
	filp->private_data = q;

	queue_me(q, ret, filp);
	up_read(&current->mm->mmap_sem);

	/* Now we map fd to filp, so userspace can access it */
	fd_install(ret, filp);
out:
	return ret;
}

long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
	      unsigned long uaddr2, int val2, int val3)
{
	int ret;

	switch (op) {
	case FUTEX_WAIT:
		ret = futex_wait(uaddr, val, timeout);
		break;
	case FUTEX_WAKE:
		ret = futex_wake(uaddr, val);
		break;
	case FUTEX_FD:
		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
		ret = futex_fd(uaddr, val);
		break;
	case FUTEX_REQUEUE:
		ret = futex_requeue(uaddr, uaddr2, val, val2, NULL);
		break;
	case FUTEX_CMP_REQUEUE:
		ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
		break;
	default:
		ret = -ENOSYS;
	}
	return ret;
}

asmlinkage long sys_futex(u32 __user *uaddr, int op, int val,
			  struct timespec __user *utime, u32 __user *uaddr2,
			  int val3)
{
	struct timespec t;
	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
	int val2 = 0;

	if ((op == FUTEX_WAIT) && utime) {
		if (copy_from_user(&t, utime, sizeof(t)) != 0)
			return -EFAULT;
		timeout = timespec_to_jiffies(&t) + 1;
	}
	/*
	 * requeue parameter in 'utime' if op is FUTEX_REQUEUE or
	 * FUTEX_CMP_REQUEUE.
	 */
	if (op >= FUTEX_REQUEUE)
		val2 = (int) (unsigned long) utime;

	return do_futex((unsigned long)uaddr, op, val, timeout,
			(unsigned long)uaddr2, val2, val3);
}

static struct super_block *
futexfs_get_sb(struct file_system_type *fs_type,
	       int flags, const char *dev_name, void *data)
{
	return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA);
}

static struct file_system_type futex_fs_type = {
	.name		= "futexfs",
	.get_sb		= futexfs_get_sb,
	.kill_sb	= kill_anon_super,
};

static int __init init(void)
{
	unsigned int i;

	register_filesystem(&futex_fs_type);
	futex_mnt = kern_mount(&futex_fs_type);

	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
		INIT_LIST_HEAD(&futex_queues[i].chain);
		spin_lock_init(&futex_queues[i].lock);
	}
	return 0;
}
__initcall(init);
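
A minimal sketch of the userspace side of the wait/wake protocol documented in futex_wait() above. The helper names here are illustrative only and not part of this commit; it assumes glibc's syscall(2) wrapper and the FUTEX_WAIT/FUTEX_WAKE constants from <linux/futex.h>:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static int futex_var;	/* the 32-bit futex word shared between threads */

/* Block while *addr == expected; a spurious early return is normal,
 * so callers re-check their condition in a loop. */
static long futex_wait_word(int *addr, int expected)
{
	return syscall(SYS_futex, addr, FUTEX_WAIT, expected, NULL, NULL, 0);
}

/* Wake at most nr waiters blocked on addr. */
static long futex_wake_word(int *addr, int nr)
{
	return syscall(SYS_futex, addr, FUTEX_WAKE, nr, NULL, NULL, 0);
}

/* Waiter: sleep until the flag becomes nonzero. */
static void wait_for_flag(void)
{
	while (futex_var == 0)
		futex_wait_word(&futex_var, 0);
}

/* Waker: publish the new value first, then wake, matching the
 * "var = new; futex_wake(&var);" ordering described in futex_wait(). */
static void set_flag_and_wake(void)
{
	futex_var = 1;
	futex_wake_word(&futex_var, 1);
}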