path: root/kernel/futex.c
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--  kernel/futex.c  1095
1 file changed, 937 insertions, 158 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5a..4b6770e9806d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 * 14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
16 * enough at me, Linus for the original (flawed) idea, Matthew 20 * enough at me, Linus for the original (flawed) idea, Matthew
17 * Kirkwood for proof-of-concept implementation. 21 * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
46#include <linux/signal.h> 50#include <linux/signal.h>
47#include <asm/futex.h> 51#include <asm/futex.h>
48 52
53#include "rtmutex_common.h"
54
49#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
50 56
51/* 57/*
@@ -63,7 +69,7 @@ union futex_key {
63 int offset; 69 int offset;
64 } shared; 70 } shared;
65 struct { 71 struct {
66 unsigned long uaddr; 72 unsigned long address;
67 struct mm_struct *mm; 73 struct mm_struct *mm;
68 int offset; 74 int offset;
69 } private; 75 } private;
@@ -75,6 +81,27 @@ union futex_key {
75}; 81};
76 82
77/* 83/*
84 * Priority Inheritance state:
85 */
86struct futex_pi_state {
87 /*
88 * list of 'owned' pi_state instances - these have to be
89 * cleaned up in do_exit() if the task exits prematurely:
90 */
91 struct list_head list;
92
93 /*
94 * The PI object:
95 */
96 struct rt_mutex pi_mutex;
97
98 struct task_struct *owner;
99 atomic_t refcount;
100
101 union futex_key key;
102};
103
104/*
78 * We use this hashed waitqueue instead of a normal wait_queue_t, so 105 * We use this hashed waitqueue instead of a normal wait_queue_t, so
79 * we can wake only the relevant ones (hashed queues may be shared). 106 * we can wake only the relevant ones (hashed queues may be shared).
80 * 107 *
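For orientation: the user-space futex word that a futex_pi_state shadows carries the owner's kernel TID in its low bits plus two flag bits, FUTEX_WAITERS and FUTEX_OWNER_DIED. A minimal decoding sketch in user-space C, assuming the FUTEX_* constants exported by <linux/futex.h>:

#include <stdio.h>
#include <stdint.h>
#include <linux/futex.h>	/* FUTEX_TID_MASK, FUTEX_WAITERS, FUTEX_OWNER_DIED */

/* Decode a PI futex word: owner TID plus two status bits. */
static void decode_pi_futex(uint32_t val)
{
	printf("owner TID:  %u\n", val & FUTEX_TID_MASK);
	printf("waiters:    %s\n", (val & FUTEX_WAITERS) ? "yes" : "no");
	printf("owner died: %s\n", (val & FUTEX_OWNER_DIED) ? "yes" : "no");
}

int main(void)
{
	decode_pi_futex(0);			/* unlocked */
	decode_pi_futex(1234 | FUTEX_WAITERS);	/* owned by TID 1234, contended */
	return 0;
}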
@@ -87,15 +114,19 @@ struct futex_q {
87 struct list_head list; 114 struct list_head list;
88 wait_queue_head_t waiters; 115 wait_queue_head_t waiters;
89 116
90 /* Which hash list lock to use. */ 117 /* Which hash list lock to use: */
91 spinlock_t *lock_ptr; 118 spinlock_t *lock_ptr;
92 119
93 /* Key which the futex is hashed on. */ 120 /* Key which the futex is hashed on: */
94 union futex_key key; 121 union futex_key key;
95 122
96 /* For fd, sigio sent using these. */ 123 /* For fd, sigio sent using these: */
97 int fd; 124 int fd;
98 struct file *filp; 125 struct file *filp;
126
127 /* Optional priority inheritance state: */
128 struct futex_pi_state *pi_state;
129 struct task_struct *task;
99}; 130};
100 131
101/* 132/*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
144 * 175 *
145 * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 176 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
146 */ 177 */
147static int get_futex_key(unsigned long uaddr, union futex_key *key) 178static int get_futex_key(u32 __user *uaddr, union futex_key *key)
148{ 179{
180 unsigned long address = (unsigned long)uaddr;
149 struct mm_struct *mm = current->mm; 181 struct mm_struct *mm = current->mm;
150 struct vm_area_struct *vma; 182 struct vm_area_struct *vma;
151 struct page *page; 183 struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
154 /* 186 /*
155 * The futex address must be "naturally" aligned. 187 * The futex address must be "naturally" aligned.
156 */ 188 */
157 key->both.offset = uaddr % PAGE_SIZE; 189 key->both.offset = address % PAGE_SIZE;
158 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 190 if (unlikely((key->both.offset % sizeof(u32)) != 0))
159 return -EINVAL; 191 return -EINVAL;
160 uaddr -= key->both.offset; 192 address -= key->both.offset;
161 193
162 /* 194 /*
163 * The futex is hashed differently depending on whether 195 * The futex is hashed differently depending on whether
164 * it's in a shared or private mapping. So check vma first. 196 * it's in a shared or private mapping. So check vma first.
165 */ 197 */
166 vma = find_extend_vma(mm, uaddr); 198 vma = find_extend_vma(mm, address);
167 if (unlikely(!vma)) 199 if (unlikely(!vma))
168 return -EFAULT; 200 return -EFAULT;
169 201
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
184 */ 216 */
185 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 217 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
186 key->private.mm = mm; 218 key->private.mm = mm;
187 key->private.uaddr = uaddr; 219 key->private.address = address;
188 return 0; 220 return 0;
189 } 221 }
190 222
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
194 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_dentry->d_inode;
195 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
196 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
197 key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
198 + vma->vm_pgoff); 230 + vma->vm_pgoff);
199 return 0; 231 return 0;
200 } 232 }
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
205 * from swap. But that's a lot of code to duplicate here 237 * from swap. But that's a lot of code to duplicate here
206 * for a rare case, so we simply fetch the page. 238 * for a rare case, so we simply fetch the page.
207 */ 239 */
208 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); 240 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
209 if (err >= 0) { 241 if (err >= 0) {
210 key->shared.pgoff = 242 key->shared.pgoff =
211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 243 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,259 @@ static void drop_key_refs(union futex_key *key)
246 } 278 }
247} 279}
248 280
249static inline int get_futex_value_locked(int *dest, int __user *from) 281static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
250{ 282{
251 int ret; 283 int ret;
252 284
253 inc_preempt_count(); 285 inc_preempt_count();
254 ret = __copy_from_user_inatomic(dest, from, sizeof(int)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
255 dec_preempt_count(); 287 dec_preempt_count();
256 288
257 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
258} 290}
259 291
260/* 292/*
293 * Fault handling. Called with current->mm->mmap_sem held.
294 */
295static int futex_handle_fault(unsigned long address, int attempt)
296{
297 struct vm_area_struct * vma;
298 struct mm_struct *mm = current->mm;
299
300 if (attempt > 2 || !(vma = find_vma(mm, address)) ||
301 vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
302 return -EFAULT;
303
304 switch (handle_mm_fault(mm, vma, address, 1)) {
305 case VM_FAULT_MINOR:
306 current->min_flt++;
307 break;
308 case VM_FAULT_MAJOR:
309 current->maj_flt++;
310 break;
311 default:
312 return -EFAULT;
313 }
314 return 0;
315}
316
317/*
318 * PI code:
319 */
320static int refill_pi_state_cache(void)
321{
322 struct futex_pi_state *pi_state;
323
324 if (likely(current->pi_state_cache))
325 return 0;
326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
328
329 if (!pi_state)
330 return -ENOMEM;
331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL;
336 atomic_set(&pi_state->refcount, 1);
337
338 current->pi_state_cache = pi_state;
339
340 return 0;
341}
342
343static struct futex_pi_state * alloc_pi_state(void)
344{
345 struct futex_pi_state *pi_state = current->pi_state_cache;
346
347 WARN_ON(!pi_state);
348 current->pi_state_cache = NULL;
349
350 return pi_state;
351}
352
353static void free_pi_state(struct futex_pi_state *pi_state)
354{
355 if (!atomic_dec_and_test(&pi_state->refcount))
356 return;
357
358 /*
359 * If pi_state->owner is NULL, the owner is most probably dying
360 * and has cleaned up the pi_state already
361 */
362 if (pi_state->owner) {
363 spin_lock_irq(&pi_state->owner->pi_lock);
364 list_del_init(&pi_state->list);
365 spin_unlock_irq(&pi_state->owner->pi_lock);
366
367 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
368 }
369
370 if (current->pi_state_cache)
371 kfree(pi_state);
372 else {
373 /*
374 * pi_state->list is already empty.
375 * clear pi_state->owner.
376 * refcount is at 0 - put it back to 1.
377 */
378 pi_state->owner = NULL;
379 atomic_set(&pi_state->refcount, 1);
380 current->pi_state_cache = pi_state;
381 }
382}
383
384/*
385 * Look up the task based on what TID userspace gave us.
386 * We don't trust it.
387 */
388static struct task_struct * futex_find_get_task(pid_t pid)
389{
390 struct task_struct *p;
391
392 rcu_read_lock();
393 p = find_task_by_pid(pid);
394 if (!p)
395 goto out_unlock;
396 if ((current->euid != p->euid) && (current->euid != p->uid)) {
397 p = NULL;
398 goto out_unlock;
399 }
400 if (p->exit_state != 0) {
401 p = NULL;
402 goto out_unlock;
403 }
404 get_task_struct(p);
405out_unlock:
406 rcu_read_unlock();
407
408 return p;
409}
410
411/*
412 * This task is holding PI mutexes at exit time => bad.
413 * Kernel cleans up PI-state, but userspace is likely hosed.
414 * (Robust-futex cleanup is separate and might save the day for userspace.)
415 */
416void exit_pi_state_list(struct task_struct *curr)
417{
418 struct list_head *next, *head = &curr->pi_state_list;
419 struct futex_pi_state *pi_state;
420 struct futex_hash_bucket *hb;
421 union futex_key key;
422
423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful
426 * versus waiters unqueueing themselves:
427 */
428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) {
430
431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key;
434 hb = hash_futex(&key);
435 spin_unlock_irq(&curr->pi_lock);
436
437 spin_lock(&hb->lock);
438
439 spin_lock_irq(&curr->pi_lock);
440 /*
441 * We dropped the pi-lock, so re-check whether this
442 * task still owns the PI-state:
443 */
444 if (head->next != next) {
445 spin_unlock(&hb->lock);
446 continue;
447 }
448
449 WARN_ON(pi_state->owner != curr);
450 WARN_ON(list_empty(&pi_state->list));
451 list_del_init(&pi_state->list);
452 pi_state->owner = NULL;
453 spin_unlock_irq(&curr->pi_lock);
454
455 rt_mutex_unlock(&pi_state->pi_mutex);
456
457 spin_unlock(&hb->lock);
458
459 spin_lock_irq(&curr->pi_lock);
460 }
461 spin_unlock_irq(&curr->pi_lock);
462}
463
464static int
465lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
466{
467 struct futex_pi_state *pi_state = NULL;
468 struct futex_q *this, *next;
469 struct list_head *head;
470 struct task_struct *p;
471 pid_t pid;
472
473 head = &hb->chain;
474
475 list_for_each_entry_safe(this, next, head, list) {
476 if (match_futex(&this->key, &me->key)) {
477 /*
478 * Another waiter already exists - bump up
479 * the refcount and return its pi_state:
480 */
481 pi_state = this->pi_state;
482 /*
483 * Userspace might have messed up non PI and PI futexes
484 */
485 if (unlikely(!pi_state))
486 return -EINVAL;
487
488 WARN_ON(!atomic_read(&pi_state->refcount));
489
490 atomic_inc(&pi_state->refcount);
491 me->pi_state = pi_state;
492
493 return 0;
494 }
495 }
496
497 /*
498 * We are the first waiter - try to look up the real owner and attach
499 * the new pi_state to it, but bail out when the owner died bit is set
500 * and TID = 0:
501 */
502 pid = uval & FUTEX_TID_MASK;
503 if (!pid && (uval & FUTEX_OWNER_DIED))
504 return -ESRCH;
505 p = futex_find_get_task(pid);
506 if (!p)
507 return -ESRCH;
508
509 pi_state = alloc_pi_state();
510
511 /*
512 * Initialize the pi_mutex in locked state and make 'p'
513 * the owner of it:
514 */
515 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
516
517 /* Store the key for possible exit cleanups: */
518 pi_state->key = me->key;
519
520 spin_lock_irq(&p->pi_lock);
521 WARN_ON(!list_empty(&pi_state->list));
522 list_add(&pi_state->list, &p->pi_state_list);
523 pi_state->owner = p;
524 spin_unlock_irq(&p->pi_lock);
525
526 put_task_struct(p);
527
528 me->pi_state = pi_state;
529
530 return 0;
531}
532
533/*
261 * The hash bucket lock must be held when this is called. 534 * The hash bucket lock must be held when this is called.
262 * Afterwards, the futex_q must not be accessed. 535 * Afterwards, the futex_q must not be accessed.
263 */ 536 */
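lookup_pi_state() trusts only the TID field of the futex word, so the user-space fast path must store the thread's kernel TID (gettid), not a pthread_t, for futex_find_get_task() to locate the owner. A hedged sketch of that uncontended acquisition attempt (the helper name is illustrative; GCC atomic builtins assumed):

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Uncontended fast path: atomically flip the futex word from 0 to our TID.
 * If this fails, the caller enters the kernel via FUTEX_LOCK_PI and the
 * kernel uses the stored TID to look up the owner task. */
static int try_lock_pi_fast(uint32_t *futex_word)
{
	uint32_t expected = 0;
	uint32_t tid = (uint32_t)syscall(SYS_gettid);	/* kernel TID, not pthread_t */

	return __atomic_compare_exchange_n(futex_word, &expected, tid,
					   0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}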
@@ -284,16 +557,105 @@ static void wake_futex(struct futex_q *q)
284 q->lock_ptr = NULL; 557 q->lock_ptr = NULL;
285} 558}
286 559
560static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
561{
562 struct task_struct *new_owner;
563 struct futex_pi_state *pi_state = this->pi_state;
564 u32 curval, newval;
565
566 if (!pi_state)
567 return -EINVAL;
568
569 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
570
571 /*
572 * This happens when we have stolen the lock and the original
573 * pending owner did not enqueue itself back on the rt_mutex.
574 * That's not a tragedy. It just tells us that a lock waiter
575 * is in flight. We make the futex_q waiter the pending owner.
576 */
577 if (!new_owner)
578 new_owner = this->task;
579
580 /*
581 * We pass it to the next owner. (The WAITERS bit is always
582 * kept enabled while there is PI state around. We must also
583 * preserve the owner died bit.)
584 */
585 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid;
587
588 inc_preempt_count();
589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
590 dec_preempt_count();
591 if (curval == -EFAULT)
592 return -EFAULT;
593 if (curval != uval)
594 return -EINVAL;
595 }
596
597 spin_lock_irq(&pi_state->owner->pi_lock);
598 WARN_ON(list_empty(&pi_state->list));
599 list_del_init(&pi_state->list);
600 spin_unlock_irq(&pi_state->owner->pi_lock);
601
602 spin_lock_irq(&new_owner->pi_lock);
603 WARN_ON(!list_empty(&pi_state->list));
604 list_add(&pi_state->list, &new_owner->pi_state_list);
605 pi_state->owner = new_owner;
606 spin_unlock_irq(&new_owner->pi_lock);
607
608 rt_mutex_unlock(&pi_state->pi_mutex);
609
610 return 0;
611}
612
613static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
614{
615 u32 oldval;
616
617 /*
618 * There is no waiter, so we unlock the futex. The owner died
619 * bit does not need to be preserved here. We are the owner:
620 */
621 inc_preempt_count();
622 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
623 dec_preempt_count();
624
625 if (oldval == -EFAULT)
626 return oldval;
627 if (oldval != uval)
628 return -EAGAIN;
629
630 return 0;
631}
632
633/*
634 * Express the locking dependencies for lockdep:
635 */
636static inline void
637double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
638{
639 if (hb1 <= hb2) {
640 spin_lock(&hb1->lock);
641 if (hb1 < hb2)
642 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
643 } else { /* hb1 > hb2 */
644 spin_lock(&hb2->lock);
645 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
646 }
647}
648
287/* 649/*
288 * Wake up all waiters hashed on the physical page that is mapped 650 * Wake up all waiters hashed on the physical page that is mapped
289 * to this virtual address: 651 * to this virtual address:
290 */ 652 */
291static int futex_wake(unsigned long uaddr, int nr_wake) 653static int futex_wake(u32 __user *uaddr, int nr_wake)
292{ 654{
293 union futex_key key; 655 struct futex_hash_bucket *hb;
294 struct futex_hash_bucket *bh;
295 struct list_head *head;
296 struct futex_q *this, *next; 656 struct futex_q *this, *next;
657 struct list_head *head;
658 union futex_key key;
297 int ret; 659 int ret;
298 660
299 down_read(&current->mm->mmap_sem); 661 down_read(&current->mm->mmap_sem);
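double_lock_hb() prevents ABBA deadlocks by always taking the two bucket locks in address order, and takes the lock only once when both keys hash to the same bucket. The same idiom sketched in user space with POSIX mutexes (illustrative, not part of the patch):

#include <pthread.h>

/* Lock two mutexes in a globally consistent (address) order, so two threads
 * locking the same pair with swapped arguments cannot deadlock. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {			/* same object: take it once */
		pthread_mutex_lock(a);
	} else if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}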
@@ -302,19 +664,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
302 if (unlikely(ret != 0)) 664 if (unlikely(ret != 0))
303 goto out; 665 goto out;
304 666
305 bh = hash_futex(&key); 667 hb = hash_futex(&key);
306 spin_lock(&bh->lock); 668 spin_lock(&hb->lock);
307 head = &bh->chain; 669 head = &hb->chain;
308 670
309 list_for_each_entry_safe(this, next, head, list) { 671 list_for_each_entry_safe(this, next, head, list) {
310 if (match_futex (&this->key, &key)) { 672 if (match_futex (&this->key, &key)) {
673 if (this->pi_state) {
674 ret = -EINVAL;
675 break;
676 }
311 wake_futex(this); 677 wake_futex(this);
312 if (++ret >= nr_wake) 678 if (++ret >= nr_wake)
313 break; 679 break;
314 } 680 }
315 } 681 }
316 682
317 spin_unlock(&bh->lock); 683 spin_unlock(&hb->lock);
318out: 684out:
319 up_read(&current->mm->mmap_sem); 685 up_read(&current->mm->mmap_sem);
320 return ret; 686 return ret;
@@ -324,10 +690,12 @@ out:
324 * Wake up all waiters hashed on the physical page that is mapped 690 * Wake up all waiters hashed on the physical page that is mapped
325 * to this virtual address: 691 * to this virtual address:
326 */ 692 */
327static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) 693static int
694futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
695 int nr_wake, int nr_wake2, int op)
328{ 696{
329 union futex_key key1, key2; 697 union futex_key key1, key2;
330 struct futex_hash_bucket *bh1, *bh2; 698 struct futex_hash_bucket *hb1, *hb2;
331 struct list_head *head; 699 struct list_head *head;
332 struct futex_q *this, *next; 700 struct futex_q *this, *next;
333 int ret, op_ret, attempt = 0; 701 int ret, op_ret, attempt = 0;
@@ -342,27 +710,25 @@ retryfull:
342 if (unlikely(ret != 0)) 710 if (unlikely(ret != 0))
343 goto out; 711 goto out;
344 712
345 bh1 = hash_futex(&key1); 713 hb1 = hash_futex(&key1);
346 bh2 = hash_futex(&key2); 714 hb2 = hash_futex(&key2);
347 715
348retry: 716retry:
349 if (bh1 < bh2) 717 double_lock_hb(hb1, hb2);
350 spin_lock(&bh1->lock);
351 spin_lock(&bh2->lock);
352 if (bh1 > bh2)
353 spin_lock(&bh1->lock);
354 718
355 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); 719 op_ret = futex_atomic_op_inuser(op, uaddr2);
356 if (unlikely(op_ret < 0)) { 720 if (unlikely(op_ret < 0)) {
357 int dummy; 721 u32 dummy;
358 722
359 spin_unlock(&bh1->lock); 723 spin_unlock(&hb1->lock);
360 if (bh1 != bh2) 724 if (hb1 != hb2)
361 spin_unlock(&bh2->lock); 725 spin_unlock(&hb2->lock);
362 726
363#ifndef CONFIG_MMU 727#ifndef CONFIG_MMU
364 /* we don't get EFAULT from MMU faults if we don't have an MMU, 728 /*
365 * but we might get them from range checking */ 729 * we don't get EFAULT from MMU faults if we don't have an MMU,
730 * but we might get them from range checking
731 */
366 ret = op_ret; 732 ret = op_ret;
367 goto out; 733 goto out;
368#endif 734#endif
@@ -372,47 +738,36 @@ retry:
372 goto out; 738 goto out;
373 } 739 }
374 740
375 /* futex_atomic_op_inuser needs to both read and write 741 /*
742 * futex_atomic_op_inuser needs to both read and write
376 * *(int __user *)uaddr2, but we can't modify it 743 * *(int __user *)uaddr2, but we can't modify it
377 * non-atomically. Therefore, if get_user below is not 744 * non-atomically. Therefore, if get_user below is not
378 * enough, we need to handle the fault ourselves, while 745 * enough, we need to handle the fault ourselves, while
379 * still holding the mmap_sem. */ 746 * still holding the mmap_sem.
747 */
380 if (attempt++) { 748 if (attempt++) {
381 struct vm_area_struct * vma; 749 if (futex_handle_fault((unsigned long)uaddr2,
382 struct mm_struct *mm = current->mm; 750 attempt)) {
383 751 ret = -EFAULT;
384 ret = -EFAULT;
385 if (attempt >= 2 ||
386 !(vma = find_vma(mm, uaddr2)) ||
387 vma->vm_start > uaddr2 ||
388 !(vma->vm_flags & VM_WRITE))
389 goto out;
390
391 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
392 case VM_FAULT_MINOR:
393 current->min_flt++;
394 break;
395 case VM_FAULT_MAJOR:
396 current->maj_flt++;
397 break;
398 default:
399 goto out; 752 goto out;
400 } 753 }
401 goto retry; 754 goto retry;
402 } 755 }
403 756
404 /* If we would have faulted, release mmap_sem, 757 /*
405 * fault it in and start all over again. */ 758 * If we would have faulted, release mmap_sem,
759 * fault it in and start all over again.
760 */
406 up_read(&current->mm->mmap_sem); 761 up_read(&current->mm->mmap_sem);
407 762
408 ret = get_user(dummy, (int __user *)uaddr2); 763 ret = get_user(dummy, uaddr2);
409 if (ret) 764 if (ret)
410 return ret; 765 return ret;
411 766
412 goto retryfull; 767 goto retryfull;
413 } 768 }
414 769
415 head = &bh1->chain; 770 head = &hb1->chain;
416 771
417 list_for_each_entry_safe(this, next, head, list) { 772 list_for_each_entry_safe(this, next, head, list) {
418 if (match_futex (&this->key, &key1)) { 773 if (match_futex (&this->key, &key1)) {
@@ -423,7 +778,7 @@ retry:
423 } 778 }
424 779
425 if (op_ret > 0) { 780 if (op_ret > 0) {
426 head = &bh2->chain; 781 head = &hb2->chain;
427 782
428 op_ret = 0; 783 op_ret = 0;
429 list_for_each_entry_safe(this, next, head, list) { 784 list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +791,9 @@ retry:
436 ret += op_ret; 791 ret += op_ret;
437 } 792 }
438 793
439 spin_unlock(&bh1->lock); 794 spin_unlock(&hb1->lock);
440 if (bh1 != bh2) 795 if (hb1 != hb2)
441 spin_unlock(&bh2->lock); 796 spin_unlock(&hb2->lock);
442out: 797out:
443 up_read(&current->mm->mmap_sem); 798 up_read(&current->mm->mmap_sem);
444 return ret; 799 return ret;
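futex_wake_op() lets a caller modify a second futex word and conditionally wake its waiters in the same kernel entry that wakes the first word's waiters (glibc uses this for condition variables). A hedged user-space sketch of such a call, assuming the FUTEX_OP() encoding and constants from <linux/futex.h>:

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Wake up to nr_wake waiters on word1; store 0 into word2 and, if word2's
 * old value was greater than 0, also wake up to nr_wake2 waiters on word2 -
 * all under the kernel's hash-bucket locks. */
static long wake_and_op(uint32_t *word1, uint32_t *word2,
			int nr_wake, int nr_wake2)
{
	return syscall(SYS_futex, word1, FUTEX_WAKE_OP, nr_wake,
		       (void *)(long)nr_wake2,	/* nr_wake2 rides in the timeout slot */
		       word2,
		       FUTEX_OP(FUTEX_OP_SET, 0, FUTEX_OP_CMP_GT, 0));
}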
@@ -448,11 +803,11 @@ out:
448 * Requeue all waiters hashed on one physical page to another 803 * Requeue all waiters hashed on one physical page to another
449 * physical page. 804 * physical page.
450 */ 805 */
451static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, 806static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
452 int nr_wake, int nr_requeue, int *valp) 807 int nr_wake, int nr_requeue, u32 *cmpval)
453{ 808{
454 union futex_key key1, key2; 809 union futex_key key1, key2;
455 struct futex_hash_bucket *bh1, *bh2; 810 struct futex_hash_bucket *hb1, *hb2;
456 struct list_head *head1; 811 struct list_head *head1;
457 struct futex_q *this, *next; 812 struct futex_q *this, *next;
458 int ret, drop_count = 0; 813 int ret, drop_count = 0;
@@ -467,68 +822,68 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
467 if (unlikely(ret != 0)) 822 if (unlikely(ret != 0))
468 goto out; 823 goto out;
469 824
470 bh1 = hash_futex(&key1); 825 hb1 = hash_futex(&key1);
471 bh2 = hash_futex(&key2); 826 hb2 = hash_futex(&key2);
472 827
473 if (bh1 < bh2) 828 double_lock_hb(hb1, hb2);
474 spin_lock(&bh1->lock);
475 spin_lock(&bh2->lock);
476 if (bh1 > bh2)
477 spin_lock(&bh1->lock);
478 829
479 if (likely(valp != NULL)) { 830 if (likely(cmpval != NULL)) {
480 int curval; 831 u32 curval;
481 832
482 ret = get_futex_value_locked(&curval, (int __user *)uaddr1); 833 ret = get_futex_value_locked(&curval, uaddr1);
483 834
484 if (unlikely(ret)) { 835 if (unlikely(ret)) {
485 spin_unlock(&bh1->lock); 836 spin_unlock(&hb1->lock);
486 if (bh1 != bh2) 837 if (hb1 != hb2)
487 spin_unlock(&bh2->lock); 838 spin_unlock(&hb2->lock);
488 839
489 /* If we would have faulted, release mmap_sem, fault 840 /*
841 * If we would have faulted, release mmap_sem, fault
490 * it in and start all over again. 842 * it in and start all over again.
491 */ 843 */
492 up_read(&current->mm->mmap_sem); 844 up_read(&current->mm->mmap_sem);
493 845
494 ret = get_user(curval, (int __user *)uaddr1); 846 ret = get_user(curval, uaddr1);
495 847
496 if (!ret) 848 if (!ret)
497 goto retry; 849 goto retry;
498 850
499 return ret; 851 return ret;
500 } 852 }
501 if (curval != *valp) { 853 if (curval != *cmpval) {
502 ret = -EAGAIN; 854 ret = -EAGAIN;
503 goto out_unlock; 855 goto out_unlock;
504 } 856 }
505 } 857 }
506 858
507 head1 = &bh1->chain; 859 head1 = &hb1->chain;
508 list_for_each_entry_safe(this, next, head1, list) { 860 list_for_each_entry_safe(this, next, head1, list) {
509 if (!match_futex (&this->key, &key1)) 861 if (!match_futex (&this->key, &key1))
510 continue; 862 continue;
511 if (++ret <= nr_wake) { 863 if (++ret <= nr_wake) {
512 wake_futex(this); 864 wake_futex(this);
513 } else { 865 } else {
514 list_move_tail(&this->list, &bh2->chain); 866 /*
515 this->lock_ptr = &bh2->lock; 867 * If key1 and key2 hash to the same bucket, no need to
868 * requeue.
869 */
870 if (likely(head1 != &hb2->chain)) {
871 list_move_tail(&this->list, &hb2->chain);
872 this->lock_ptr = &hb2->lock;
873 }
516 this->key = key2; 874 this->key = key2;
517 get_key_refs(&key2); 875 get_key_refs(&key2);
518 drop_count++; 876 drop_count++;
519 877
520 if (ret - nr_wake >= nr_requeue) 878 if (ret - nr_wake >= nr_requeue)
521 break; 879 break;
522 /* Make sure to stop if key1 == key2 */
523 if (head1 == &bh2->chain && head1 != &next->list)
524 head1 = &this->list;
525 } 880 }
526 } 881 }
527 882
528out_unlock: 883out_unlock:
529 spin_unlock(&bh1->lock); 884 spin_unlock(&hb1->lock);
530 if (bh1 != bh2) 885 if (hb1 != hb2)
531 spin_unlock(&bh2->lock); 886 spin_unlock(&hb2->lock);
532 887
533 /* drop_key_refs() must be called outside the spinlocks. */ 888 /* drop_key_refs() must be called outside the spinlocks. */
534 while (--drop_count >= 0) 889 while (--drop_count >= 0)
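From user space, futex_requeue() is driven by FUTEX_CMP_REQUEUE: wake a few waiters on one word and move the rest to a second word, but only if the first word still holds the expected value (otherwise -EAGAIN and the caller retries). A hedged sketch, with nr_requeue passed through the timeout slot the way sys_futex() decodes it at the end of this patch:

#include <stdint.h>
#include <limits.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Wake one waiter on cond_word and requeue the remaining waiters onto
 * mutex_word, provided cond_word still contains expected_val. */
static long broadcast_requeue(uint32_t *cond_word, uint32_t *mutex_word,
			      uint32_t expected_val)
{
	return syscall(SYS_futex, cond_word, FUTEX_CMP_REQUEUE,
		       1,			/* nr_wake */
		       (void *)(long)INT_MAX,	/* nr_requeue, in the timeout slot */
		       mutex_word, expected_val);
}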
@@ -543,7 +898,7 @@ out:
543static inline struct futex_hash_bucket * 898static inline struct futex_hash_bucket *
544queue_lock(struct futex_q *q, int fd, struct file *filp) 899queue_lock(struct futex_q *q, int fd, struct file *filp)
545{ 900{
546 struct futex_hash_bucket *bh; 901 struct futex_hash_bucket *hb;
547 902
548 q->fd = fd; 903 q->fd = fd;
549 q->filp = filp; 904 q->filp = filp;
@@ -551,23 +906,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
551 init_waitqueue_head(&q->waiters); 906 init_waitqueue_head(&q->waiters);
552 907
553 get_key_refs(&q->key); 908 get_key_refs(&q->key);
554 bh = hash_futex(&q->key); 909 hb = hash_futex(&q->key);
555 q->lock_ptr = &bh->lock; 910 q->lock_ptr = &hb->lock;
556 911
557 spin_lock(&bh->lock); 912 spin_lock(&hb->lock);
558 return bh; 913 return hb;
559} 914}
560 915
561static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) 916static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
562{ 917{
563 list_add_tail(&q->list, &bh->chain); 918 list_add_tail(&q->list, &hb->chain);
564 spin_unlock(&bh->lock); 919 q->task = current;
920 spin_unlock(&hb->lock);
565} 921}
566 922
567static inline void 923static inline void
568queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) 924queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
569{ 925{
570 spin_unlock(&bh->lock); 926 spin_unlock(&hb->lock);
571 drop_key_refs(&q->key); 927 drop_key_refs(&q->key);
572} 928}
573 929
@@ -579,20 +935,22 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
579/* The key must be already stored in q->key. */ 935/* The key must be already stored in q->key. */
580static void queue_me(struct futex_q *q, int fd, struct file *filp) 936static void queue_me(struct futex_q *q, int fd, struct file *filp)
581{ 937{
582 struct futex_hash_bucket *bh; 938 struct futex_hash_bucket *hb;
583 bh = queue_lock(q, fd, filp); 939
584 __queue_me(q, bh); 940 hb = queue_lock(q, fd, filp);
941 __queue_me(q, hb);
585} 942}
586 943
587/* Return 1 if we were still queued (ie. 0 means we were woken) */ 944/* Return 1 if we were still queued (ie. 0 means we were woken) */
588static int unqueue_me(struct futex_q *q) 945static int unqueue_me(struct futex_q *q)
589{ 946{
590 int ret = 0;
591 spinlock_t *lock_ptr; 947 spinlock_t *lock_ptr;
948 int ret = 0;
592 949
593 /* In the common case we don't take the spinlock, which is nice. */ 950 /* In the common case we don't take the spinlock, which is nice. */
594 retry: 951 retry:
595 lock_ptr = q->lock_ptr; 952 lock_ptr = q->lock_ptr;
953 barrier();
596 if (lock_ptr != 0) { 954 if (lock_ptr != 0) {
597 spin_lock(lock_ptr); 955 spin_lock(lock_ptr);
598 /* 956 /*
@@ -614,6 +972,9 @@ static int unqueue_me(struct futex_q *q)
614 } 972 }
615 WARN_ON(list_empty(&q->list)); 973 WARN_ON(list_empty(&q->list));
616 list_del(&q->list); 974 list_del(&q->list);
975
976 BUG_ON(q->pi_state);
977
617 spin_unlock(lock_ptr); 978 spin_unlock(lock_ptr);
618 ret = 1; 979 ret = 1;
619 } 980 }
@@ -622,21 +983,42 @@ static int unqueue_me(struct futex_q *q)
622 return ret; 983 return ret;
623} 984}
624 985
625static int futex_wait(unsigned long uaddr, int val, unsigned long time) 986/*
987 * PI futexes cannot be requeued and must remove themselves from the
988 * hash bucket. The hash bucket lock is held on entry and dropped here.
989 */
990static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
991{
992 WARN_ON(list_empty(&q->list));
993 list_del(&q->list);
994
995 BUG_ON(!q->pi_state);
996 free_pi_state(q->pi_state);
997 q->pi_state = NULL;
998
999 spin_unlock(&hb->lock);
1000
1001 drop_key_refs(&q->key);
1002}
1003
1004static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
626{ 1005{
627 DECLARE_WAITQUEUE(wait, current); 1006 struct task_struct *curr = current;
628 int ret, curval; 1007 DECLARE_WAITQUEUE(wait, curr);
1008 struct futex_hash_bucket *hb;
629 struct futex_q q; 1009 struct futex_q q;
630 struct futex_hash_bucket *bh; 1010 u32 uval;
1011 int ret;
631 1012
1013 q.pi_state = NULL;
632 retry: 1014 retry:
633 down_read(&current->mm->mmap_sem); 1015 down_read(&curr->mm->mmap_sem);
634 1016
635 ret = get_futex_key(uaddr, &q.key); 1017 ret = get_futex_key(uaddr, &q.key);
636 if (unlikely(ret != 0)) 1018 if (unlikely(ret != 0))
637 goto out_release_sem; 1019 goto out_release_sem;
638 1020
639 bh = queue_lock(&q, -1, NULL); 1021 hb = queue_lock(&q, -1, NULL);
640 1022
641 /* 1023 /*
642 * Access the page AFTER the futex is queued. 1024 * Access the page AFTER the futex is queued.
@@ -658,37 +1040,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
658 * We hold the mmap semaphore, so the mapping cannot have changed 1040 * We hold the mmap semaphore, so the mapping cannot have changed
659 * since we looked it up in get_futex_key. 1041 * since we looked it up in get_futex_key.
660 */ 1042 */
661 1043 ret = get_futex_value_locked(&uval, uaddr);
662 ret = get_futex_value_locked(&curval, (int __user *)uaddr);
663 1044
664 if (unlikely(ret)) { 1045 if (unlikely(ret)) {
665 queue_unlock(&q, bh); 1046 queue_unlock(&q, hb);
666 1047
667 /* If we would have faulted, release mmap_sem, fault it in and 1048 /*
1049 * If we would have faulted, release mmap_sem, fault it in and
668 * start all over again. 1050 * start all over again.
669 */ 1051 */
670 up_read(&current->mm->mmap_sem); 1052 up_read(&curr->mm->mmap_sem);
671 1053
672 ret = get_user(curval, (int __user *)uaddr); 1054 ret = get_user(uval, uaddr);
673 1055
674 if (!ret) 1056 if (!ret)
675 goto retry; 1057 goto retry;
676 return ret; 1058 return ret;
677 } 1059 }
678 if (curval != val) { 1060 ret = -EWOULDBLOCK;
679 ret = -EWOULDBLOCK; 1061 if (uval != val)
680 queue_unlock(&q, bh); 1062 goto out_unlock_release_sem;
681 goto out_release_sem;
682 }
683 1063
684 /* Only actually queue if *uaddr contained val. */ 1064 /* Only actually queue if *uaddr contained val. */
685 __queue_me(&q, bh); 1065 __queue_me(&q, hb);
686 1066
687 /* 1067 /*
688 * Now the futex is queued and we have checked the data, we 1068 * Now the futex is queued and we have checked the data, we
689 * don't want to hold mmap_sem while we sleep. 1069 * don't want to hold mmap_sem while we sleep.
690 */ 1070 */
691 up_read(&current->mm->mmap_sem); 1071 up_read(&curr->mm->mmap_sem);
692 1072
693 /* 1073 /*
694 * There might have been scheduling since the queue_me(), as we 1074 * There might have been scheduling since the queue_me(), as we
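The value check futex_wait() performs under the hash-bucket lock is what makes the protocol race-free: the kernel re-reads the word while holding the bucket lock and queues the waiter only if it still holds the value the caller passed in, returning -EWOULDBLOCK otherwise. A minimal sketch of the matching user-space wait/wake pair (illustrative helpers, untimed):

#include <stdint.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Sleep while *word == val. EWOULDBLOCK means the value changed between our
 * read and the kernel's re-check, so the caller simply re-evaluates. */
static long futex_wait_word(uint32_t *word, uint32_t val)
{
	return syscall(SYS_futex, word, FUTEX_WAIT, val, NULL, NULL, 0);
}

/* Wake up to nr waiters sleeping on word, typically after storing a new value. */
static long futex_wake_word(uint32_t *word, int nr)
{
	return syscall(SYS_futex, word, FUTEX_WAKE, nr, NULL, NULL, 0);
}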
@@ -720,12 +1100,367 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
720 return 0; 1100 return 0;
721 if (time == 0) 1101 if (time == 0)
722 return -ETIMEDOUT; 1102 return -ETIMEDOUT;
723 /* We expect signal_pending(current), but another thread may 1103 /*
724 * have handled it for us already. */ 1104 * We expect signal_pending(current), but another thread may
1105 * have handled it for us already.
1106 */
725 return -EINTR; 1107 return -EINTR;
726 1108
1109 out_unlock_release_sem:
1110 queue_unlock(&q, hb);
1111
727 out_release_sem: 1112 out_release_sem:
1113 up_read(&curr->mm->mmap_sem);
1114 return ret;
1115}
1116
1117/*
1118 * Userspace tried a 0 -> TID atomic transition of the futex value
1119 * and failed. The kernel side here does the whole locking operation:
1120 * if there are waiters then it will block, it does PI, etc. (Due to
1121 * races the kernel might see a 0 value of the futex too.)
1122 */
1123static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1124 long nsec, int trylock)
1125{
1126 struct hrtimer_sleeper timeout, *to = NULL;
1127 struct task_struct *curr = current;
1128 struct futex_hash_bucket *hb;
1129 u32 uval, newval, curval;
1130 struct futex_q q;
1131 int ret, attempt = 0;
1132
1133 if (refill_pi_state_cache())
1134 return -ENOMEM;
1135
1136 if (sec != MAX_SCHEDULE_TIMEOUT) {
1137 to = &timeout;
1138 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1139 hrtimer_init_sleeper(to, current);
1140 to->timer.expires = ktime_set(sec, nsec);
1141 }
1142
1143 q.pi_state = NULL;
1144 retry:
1145 down_read(&curr->mm->mmap_sem);
1146
1147 ret = get_futex_key(uaddr, &q.key);
1148 if (unlikely(ret != 0))
1149 goto out_release_sem;
1150
1151 hb = queue_lock(&q, -1, NULL);
1152
1153 retry_locked:
1154 /*
1155 * To avoid races, we attempt to take the lock here again
1156 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1157 * the locks. It will most likely not succeed.
1158 */
1159 newval = current->pid;
1160
1161 inc_preempt_count();
1162 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1163 dec_preempt_count();
1164
1165 if (unlikely(curval == -EFAULT))
1166 goto uaddr_faulted;
1167
1168 /* We own the lock already */
1169 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1170 if (!detect && 0)
1171 force_sig(SIGKILL, current);
1172 ret = -EDEADLK;
1173 goto out_unlock_release_sem;
1174 }
1175
1176 /*
1177 * Surprise - we got the lock. Just return
1178 * to userspace:
1179 */
1180 if (unlikely(!curval))
1181 goto out_unlock_release_sem;
1182
1183 uval = curval;
1184 newval = uval | FUTEX_WAITERS;
1185
1186 inc_preempt_count();
1187 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1188 dec_preempt_count();
1189
1190 if (unlikely(curval == -EFAULT))
1191 goto uaddr_faulted;
1192 if (unlikely(curval != uval))
1193 goto retry_locked;
1194
1195 /*
1196 * We don't have the lock. Look up the PI state (or create it if
1197 * we are the first waiter):
1198 */
1199 ret = lookup_pi_state(uval, hb, &q);
1200
1201 if (unlikely(ret)) {
1202 /*
1203 * There were no waiters and the owner task lookup
1204 * failed. When the OWNER_DIED bit is set, then we
1205 * know that this is a robust futex and we actually
1206 * take the lock. This is safe as we are protected by
1207 * the hash bucket lock. We also set the waiters bit
1208 * unconditionally here, to simplify glibc handling of
1209 * multiple tasks racing to acquire the lock and
1210 * clean up the problems which were left by the dead
1211 * owner.
1212 */
1213 if (curval & FUTEX_OWNER_DIED) {
1214 uval = newval;
1215 newval = current->pid |
1216 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1217
1218 inc_preempt_count();
1219 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1220 uval, newval);
1221 dec_preempt_count();
1222
1223 if (unlikely(curval == -EFAULT))
1224 goto uaddr_faulted;
1225 if (unlikely(curval != uval))
1226 goto retry_locked;
1227 ret = 0;
1228 }
1229 goto out_unlock_release_sem;
1230 }
1231
1232 /*
1233 * Only actually queue now that the atomic ops are done:
1234 */
1235 __queue_me(&q, hb);
1236
1237 /*
1238 * Now the futex is queued and we have checked the data, we
1239 * don't want to hold mmap_sem while we sleep.
1240 */
1241 up_read(&curr->mm->mmap_sem);
1242
1243 WARN_ON(!q.pi_state);
1244 /*
1245 * Block on the PI mutex:
1246 */
1247 if (!trylock)
1248 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1249 else {
1250 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1251 /* Fixup the trylock return value: */
1252 ret = ret ? 0 : -EWOULDBLOCK;
1253 }
1254
1255 down_read(&curr->mm->mmap_sem);
1256 spin_lock(q.lock_ptr);
1257
1258 /*
1259 * Got the lock. We might not be the anticipated owner if we
1260 * did a lock-steal - fix up the PI-state in that case.
1261 */
1262 if (!ret && q.pi_state->owner != curr) {
1263 u32 newtid = current->pid | FUTEX_WAITERS;
1264
1265 /* Owner died? */
1266 if (q.pi_state->owner != NULL) {
1267 spin_lock_irq(&q.pi_state->owner->pi_lock);
1268 WARN_ON(list_empty(&q.pi_state->list));
1269 list_del_init(&q.pi_state->list);
1270 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1271 } else
1272 newtid |= FUTEX_OWNER_DIED;
1273
1274 q.pi_state->owner = current;
1275
1276 spin_lock_irq(&current->pi_lock);
1277 WARN_ON(!list_empty(&q.pi_state->list));
1278 list_add(&q.pi_state->list, &current->pi_state_list);
1279 spin_unlock_irq(&current->pi_lock);
1280
1281 /* Unqueue and drop the lock */
1282 unqueue_me_pi(&q, hb);
1283 up_read(&curr->mm->mmap_sem);
1284 /*
1285 * We own it, so we have to replace the pending owner
1286 * TID. This must be atomic as we have to preserve the
1287 * owner died bit here.
1288 */
1289 ret = get_user(uval, uaddr);
1290 while (!ret) {
1291 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1292 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1293 uval, newval);
1294 if (curval == -EFAULT)
1295 ret = -EFAULT;
1296 if (curval == uval)
1297 break;
1298 uval = curval;
1299 }
1300 } else {
1301 /*
1302 * Catch the rare case, where the lock was released
1303 * when we were on the way back before we locked
1304 * the hash bucket.
1305 */
1306 if (ret && q.pi_state->owner == curr) {
1307 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1308 ret = 0;
1309 }
1310 /* Unqueue and drop the lock */
1311 unqueue_me_pi(&q, hb);
1312 up_read(&curr->mm->mmap_sem);
1313 }
1314
1315 if (!detect && ret == -EDEADLK && 0)
1316 force_sig(SIGKILL, current);
1317
1318 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1319
1320 out_unlock_release_sem:
1321 queue_unlock(&q, hb);
1322
1323 out_release_sem:
1324 up_read(&curr->mm->mmap_sem);
1325 return ret;
1326
1327 uaddr_faulted:
1328 /*
1329 * We have to r/w *(int __user *)uaddr, but we can't modify it
1330 * non-atomically. Therefore, if get_user below is not
1331 * enough, we need to handle the fault ourselves, while
1332 * still holding the mmap_sem.
1333 */
1334 if (attempt++) {
1335 if (futex_handle_fault((unsigned long)uaddr, attempt)) {
1336 ret = -EFAULT;
1337 goto out_unlock_release_sem;
1338 }
1339 goto retry_locked;
1340 }
1341
1342 queue_unlock(&q, hb);
1343 up_read(&curr->mm->mmap_sem);
1344
1345 ret = get_user(uval, uaddr);
1346 if (!ret && (uval != -EFAULT))
1347 goto retry;
1348
1349 return ret;
1350}
1351
1352/*
1353 * Userspace attempted a TID -> 0 atomic transition, and failed.
1354 * This is the in-kernel slowpath: we look up the PI state (if any),
1355 * and do the rt-mutex unlock.
1356 */
1357static int futex_unlock_pi(u32 __user *uaddr)
1358{
1359 struct futex_hash_bucket *hb;
1360 struct futex_q *this, *next;
1361 u32 uval;
1362 struct list_head *head;
1363 union futex_key key;
1364 int ret, attempt = 0;
1365
1366retry:
1367 if (get_user(uval, uaddr))
1368 return -EFAULT;
1369 /*
1370 * We release only a lock we actually own:
1371 */
1372 if ((uval & FUTEX_TID_MASK) != current->pid)
1373 return -EPERM;
1374 /*
1375 * First take all the futex related locks:
1376 */
1377 down_read(&current->mm->mmap_sem);
1378
1379 ret = get_futex_key(uaddr, &key);
1380 if (unlikely(ret != 0))
1381 goto out;
1382
1383 hb = hash_futex(&key);
1384 spin_lock(&hb->lock);
1385
1386retry_locked:
1387 /*
1388 * To avoid races, try to do the TID -> 0 atomic transition
1389 * again. If it succeeds then we can return without waking
1390 * anyone else up:
1391 */
1392 if (!(uval & FUTEX_OWNER_DIED)) {
1393 inc_preempt_count();
1394 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1395 dec_preempt_count();
1396 }
1397
1398 if (unlikely(uval == -EFAULT))
1399 goto pi_faulted;
1400 /*
1401 * Rare case: we managed to release the lock atomically,
1402 * no need to wake anyone else up:
1403 */
1404 if (unlikely(uval == current->pid))
1405 goto out_unlock;
1406
1407 /*
1408 * Ok, other tasks may need to be woken up - check waiters
1409 * and do the wakeup if necessary:
1410 */
1411 head = &hb->chain;
1412
1413 list_for_each_entry_safe(this, next, head, list) {
1414 if (!match_futex (&this->key, &key))
1415 continue;
1416 ret = wake_futex_pi(uaddr, uval, this);
1417 /*
1418 * The atomic access to the futex value
1419 * generated a pagefault, so retry the
1420 * user-access and the wakeup:
1421 */
1422 if (ret == -EFAULT)
1423 goto pi_faulted;
1424 goto out_unlock;
1425 }
1426 /*
1427 * No waiters - kernel unlocks the futex:
1428 */
1429 if (!(uval & FUTEX_OWNER_DIED)) {
1430 ret = unlock_futex_pi(uaddr, uval);
1431 if (ret == -EFAULT)
1432 goto pi_faulted;
1433 }
1434
1435out_unlock:
1436 spin_unlock(&hb->lock);
1437out:
728 up_read(&current->mm->mmap_sem); 1438 up_read(&current->mm->mmap_sem);
1439
1440 return ret;
1441
1442pi_faulted:
1443 /*
1444 * We have to r/w *(int __user *)uaddr, but we can't modify it
1445 * non-atomically. Therefore, if get_user below is not
1446 * enough, we need to handle the fault ourselves, while
1447 * still holding the mmap_sem.
1448 */
1449 if (attempt++) {
1450 if (futex_handle_fault((unsigned long)uaddr, attempt)) {
1451 ret = -EFAULT;
1452 goto out_unlock;
1453 }
1454 goto retry_locked;
1455 }
1456
1457 spin_unlock(&hb->lock);
1458 up_read(&current->mm->mmap_sem);
1459
1460 ret = get_user(uval, uaddr);
1461 if (!ret && (uval != -EFAULT))
1462 goto retry;
1463
729 return ret; 1464 return ret;
730} 1465}
731 1466
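futex_lock_pi() and futex_unlock_pi() are only the slow paths of a user-space PI mutex: the fast paths are a bare cmpxchg of the futex word (0 -> TID to lock, TID -> 0 to unlock), and the kernel is entered only when that fails, i.e. when the lock is contended or a flag bit such as FUTEX_WAITERS is set. A hedged sketch of that protocol (helper names illustrative, error handling elided):

#include <stdint.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static uint32_t self_tid(void)
{
	return (uint32_t)syscall(SYS_gettid);
}

/* Lock: try the 0 -> TID transition; on failure let the kernel queue us on
 * the rt_mutex, boost the owner and hand the lock over. */
static void pi_lock(uint32_t *word)
{
	uint32_t expected = 0;

	if (__atomic_compare_exchange_n(word, &expected, self_tid(), 0,
					__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
		return;					/* fast path */
	/* A real implementation also handles -EINTR, -EDEADLK, -ESRCH. */
	syscall(SYS_futex, word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

/* Unlock: try the TID -> 0 transition; if any flag bit is set the kernel
 * must pick the next owner and wake it. */
static void pi_unlock(uint32_t *word)
{
	uint32_t expected = self_tid();

	if (__atomic_compare_exchange_n(word, &expected, 0, 0,
					__ATOMIC_RELEASE, __ATOMIC_RELAXED))
		return;					/* fast path */
	syscall(SYS_futex, word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}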
@@ -735,6 +1470,7 @@ static int futex_close(struct inode *inode, struct file *filp)
735 1470
736 unqueue_me(q); 1471 unqueue_me(q);
737 kfree(q); 1472 kfree(q);
1473
738 return 0; 1474 return 0;
739} 1475}
740 1476
@@ -766,7 +1502,7 @@ static struct file_operations futex_fops = {
766 * Signal allows caller to avoid the race which would occur if they 1502 * Signal allows caller to avoid the race which would occur if they
767 * set the sigio stuff up afterwards. 1503 * set the sigio stuff up afterwards.
768 */ 1504 */
769static int futex_fd(unsigned long uaddr, int signal) 1505static int futex_fd(u32 __user *uaddr, int signal)
770{ 1506{
771 struct futex_q *q; 1507 struct futex_q *q;
772 struct file *filp; 1508 struct file *filp;
@@ -803,6 +1539,7 @@ static int futex_fd(unsigned long uaddr, int signal)
803 err = -ENOMEM; 1539 err = -ENOMEM;
804 goto error; 1540 goto error;
805 } 1541 }
1542 q->pi_state = NULL;
806 1543
807 down_read(&current->mm->mmap_sem); 1544 down_read(&current->mm->mmap_sem);
808 err = get_futex_key(uaddr, &q->key); 1545 err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1577,7 @@ error:
840 * Implementation: user-space maintains a per-thread list of locks it 1577 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list, 1578 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the 1579 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is 1580 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and 1581 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 1582 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after 1583 * field, to allow the kernel to clean up if the thread dies after
@@ -887,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
887 struct task_struct *p; 1624 struct task_struct *p;
888 1625
889 ret = -ESRCH; 1626 ret = -ESRCH;
890 read_lock(&tasklist_lock); 1627 rcu_read_lock();
891 p = find_task_by_pid(pid); 1628 p = find_task_by_pid(pid);
892 if (!p) 1629 if (!p)
893 goto err_unlock; 1630 goto err_unlock;
@@ -896,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
896 !capable(CAP_SYS_PTRACE)) 1633 !capable(CAP_SYS_PTRACE))
897 goto err_unlock; 1634 goto err_unlock;
898 head = p->robust_list; 1635 head = p->robust_list;
899 read_unlock(&tasklist_lock); 1636 rcu_read_unlock();
900 } 1637 }
901 1638
902 if (put_user(sizeof(*head), len_ptr)) 1639 if (put_user(sizeof(*head), len_ptr))
@@ -904,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
904 return put_user(head, head_ptr); 1641 return put_user(head, head_ptr);
905 1642
906err_unlock: 1643err_unlock:
907 read_unlock(&tasklist_lock); 1644 rcu_read_unlock();
908 1645
909 return ret; 1646 return ret;
910} 1647}
@@ -913,9 +1650,9 @@ err_unlock:
913 * Process a futex-list entry, check whether it's owned by the 1650 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so: 1651 * dying task, and do notification if so:
915 */ 1652 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1653int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
917{ 1654{
918 u32 uval; 1655 u32 uval, nval, mval;
919 1656
920retry: 1657retry:
921 if (get_user(uval, uaddr)) 1658 if (get_user(uval, uaddr))
@@ -932,17 +1669,45 @@ retry:
932 * thread-death.) The rest of the cleanup is done in 1669 * thread-death.) The rest of the cleanup is done in
933 * userspace. 1670 * userspace.
934 */ 1671 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval, 1672 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
936 uval | FUTEX_OWNER_DIED) != uval) 1673 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1674
1675 if (nval == -EFAULT)
1676 return -1;
1677
1678 if (nval != uval)
937 goto retry; 1679 goto retry;
938 1680
939 if (uval & FUTEX_WAITERS) 1681 /*
940 futex_wake((unsigned long)uaddr, 1); 1682 * Wake robust non-PI futexes here. The wakeup of
1683 * PI futexes happens in exit_pi_state():
1684 */
1685 if (!pi) {
1686 if (uval & FUTEX_WAITERS)
1687 futex_wake(uaddr, 1);
1688 }
941 } 1689 }
942 return 0; 1690 return 0;
943} 1691}
944 1692
945/* 1693/*
1694 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
1695 */
1696static inline int fetch_robust_entry(struct robust_list __user **entry,
1697 struct robust_list __user **head, int *pi)
1698{
1699 unsigned long uentry;
1700
1701 if (get_user(uentry, (unsigned long *)head))
1702 return -EFAULT;
1703
1704 *entry = (void *)(uentry & ~1UL);
1705 *pi = uentry & 1;
1706
1707 return 0;
1708}
1709
1710/*
946 * Walk curr->robust_list (very carefully, it's a userspace list!) 1711 * Walk curr->robust_list (very carefully, it's a userspace list!)
947 * and mark any locks found there dead, and notify any waiters. 1712 * and mark any locks found there dead, and notify any waiters.
948 * 1713 *
@@ -952,14 +1717,14 @@ void exit_robust_list(struct task_struct *curr)
952{ 1717{
953 struct robust_list_head __user *head = curr->robust_list; 1718 struct robust_list_head __user *head = curr->robust_list;
954 struct robust_list __user *entry, *pending; 1719 struct robust_list __user *entry, *pending;
955 unsigned int limit = ROBUST_LIST_LIMIT; 1720 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
956 unsigned long futex_offset; 1721 unsigned long futex_offset;
957 1722
958 /* 1723 /*
959 * Fetch the list head (which was registered earlier, via 1724 * Fetch the list head (which was registered earlier, via
960 * sys_set_robust_list()): 1725 * sys_set_robust_list()):
961 */ 1726 */
962 if (get_user(entry, &head->list.next)) 1727 if (fetch_robust_entry(&entry, &head->list.next, &pi))
963 return; 1728 return;
964 /* 1729 /*
965 * Fetch the relative futex offset: 1730 * Fetch the relative futex offset:
@@ -970,24 +1735,25 @@ void exit_robust_list(struct task_struct *curr)
970 * Fetch any possibly pending lock-add first, and handle it 1735 * Fetch any possibly pending lock-add first, and handle it
971 * if it exists: 1736 * if it exists:
972 */ 1737 */
973 if (get_user(pending, &head->list_op_pending)) 1738 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
974 return; 1739 return;
1740
975 if (pending) 1741 if (pending)
976 handle_futex_death((void *)pending + futex_offset, curr); 1742 handle_futex_death((void *)pending + futex_offset, curr, pip);
977 1743
978 while (entry != &head->list) { 1744 while (entry != &head->list) {
979 /* 1745 /*
980 * A pending lock might already be on the list, so 1746 * A pending lock might already be on the list, so
981 * dont process it twice: 1747 * don't process it twice:
982 */ 1748 */
983 if (entry != pending) 1749 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset, 1750 if (handle_futex_death((void *)entry + futex_offset,
985 curr)) 1751 curr, pi))
986 return; 1752 return;
987 /* 1753 /*
988 * Fetch the next entry in the list: 1754 * Fetch the next entry in the list:
989 */ 1755 */
990 if (get_user(entry, &entry->next)) 1756 if (fetch_robust_entry(&entry, &entry->next, &pi))
991 return; 1757 return;
992 /* 1758 /*
993 * Avoid excessively long or circular lists: 1759 * Avoid excessively long or circular lists:
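fetch_robust_entry() above fixes the user-space contract: each robust-list entry is a pointer to a struct robust_list node embedded near the lock, and bit 0 of that pointer marks the lock as a PI futex. A hedged sketch of how a lock library would encode an entry before linking it in (the struct layout here is illustrative; only the bit-0 convention comes from the patch):

#include <stdint.h>
#include <linux/futex.h>	/* struct robust_list */

/* A user-space robust lock: the list node the kernel walks at exit time,
 * followed by the futex word it will mark FUTEX_OWNER_DIED. */
struct robust_lock {
	struct robust_list list;	/* ->next chains these nodes */
	uint32_t futex_word;
};

/* Encode the list entry for a lock: the node's address, with bit 0 set
 * when the lock is a PI futex, exactly as fetch_robust_entry() expects. */
static struct robust_list *encode_robust_entry(struct robust_lock *lock, int is_pi)
{
	uintptr_t entry = (uintptr_t)&lock->list;

	return (struct robust_list *)(entry | (is_pi ? 1UL : 0UL));
}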
@@ -999,8 +1765,8 @@ void exit_robust_list(struct task_struct *curr)
999 } 1765 }
1000} 1766}
1001 1767
1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1768long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
1003 unsigned long uaddr2, int val2, int val3) 1769 u32 __user *uaddr2, u32 val2, u32 val3)
1004{ 1770{
1005 int ret; 1771 int ret;
1006 1772
@@ -1024,6 +1790,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1024 case FUTEX_WAKE_OP: 1790 case FUTEX_WAKE_OP:
1025 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 1791 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1026 break; 1792 break;
1793 case FUTEX_LOCK_PI:
1794 ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
1795 break;
1796 case FUTEX_UNLOCK_PI:
1797 ret = futex_unlock_pi(uaddr);
1798 break;
1799 case FUTEX_TRYLOCK_PI:
1800 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
1801 break;
1027 default: 1802 default:
1028 ret = -ENOSYS; 1803 ret = -ENOSYS;
1029 } 1804 }
@@ -1031,29 +1806,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1031} 1806}
1032 1807
1033 1808
1034asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, 1809asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1035 struct timespec __user *utime, u32 __user *uaddr2, 1810 struct timespec __user *utime, u32 __user *uaddr2,
1036 int val3) 1811 u32 val3)
1037{ 1812{
1038 struct timespec t; 1813 struct timespec t;
1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1814 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1040 int val2 = 0; 1815 u32 val2 = 0;
1041 1816
1042 if (utime && (op == FUTEX_WAIT)) { 1817 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1043 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1818 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1044 return -EFAULT; 1819 return -EFAULT;
1045 if (!timespec_valid(&t)) 1820 if (!timespec_valid(&t))
1046 return -EINVAL; 1821 return -EINVAL;
1047 timeout = timespec_to_jiffies(&t) + 1; 1822 if (op == FUTEX_WAIT)
1823 timeout = timespec_to_jiffies(&t) + 1;
1824 else {
1825 timeout = t.tv_sec;
1826 val2 = t.tv_nsec;
1827 }
1048 } 1828 }
1049 /* 1829 /*
1050 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 1830 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1051 */ 1831 */
1052 if (op >= FUTEX_REQUEUE) 1832 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1053 val2 = (int) (unsigned long) utime; 1833 val2 = (u32) (unsigned long) utime;
1054 1834
1055 return do_futex((unsigned long)uaddr, op, val, timeout, 1835 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
1056 (unsigned long)uaddr2, val2, val3);
1057} 1836}
1058 1837
1059static int futexfs_get_sb(struct file_system_type *fs_type, 1838static int futexfs_get_sb(struct file_system_type *fs_type,