Diffstat (limited to 'kernel/futex.c')
-rw-r--r--  kernel/futex.c | 988
1 file changed, 769 insertions(+), 219 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 600bc9d801f2..b7ce15c67e32 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -16,6 +16,9 @@
  * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
  *
+ * PRIVATE futexes by Eric Dumazet
+ * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
+ *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -53,6 +56,12 @@
 
 #include "rtmutex_common.h"
 
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -81,12 +90,12 @@ struct futex_pi_state {
  * we can wake only the relevant ones (hashed queues may be shared).
  *
  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
- * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0.
+ * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
  * The order of wakeup is always to make the first condition true, then
  * wake up q->waiters, then make the second condition true.
  */
 struct futex_q {
-	struct list_head list;
+	struct plist_node list;
 	wait_queue_head_t waiters;
 
 	/* Which hash list lock to use: */
@@ -102,14 +111,20 @@ struct futex_q {
 	/* Optional priority inheritance state: */
 	struct futex_pi_state *pi_state;
 	struct task_struct *task;
+
+	/*
+	 * This waiter is used in case of requeue from a
+	 * normal futex to a PI-futex
+	 */
+	struct rt_mutex_waiter waiter;
 };
 
 /*
  * Split the global futex_lock into every hash list lock.
  */
 struct futex_hash_bucket {
 	spinlock_t lock;
-	struct list_head chain;
+	struct plist_head chain;
 };
 
 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -138,19 +153,26 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
 	 && key1->both.offset == key2->both.offset);
 }
 
-/*
- * Get parameters which are the keys for a futex.
+/**
+ * get_futex_key - Get parameters which are the keys for a futex.
+ * @uaddr: virtual address of the futex
+ * @shared: NULL for a PROCESS_PRIVATE futex,
+ *	&current->mm->mmap_sem for a PROCESS_SHARED futex
+ * @key: address where result is stored.
+ *
+ * Returns a negative error code or 0
+ * The key words are stored in *key on success.
  *
  * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
  * offset_within_page). For private mappings, it's (uaddr, current->mm).
  * We can usually work out the index without swapping in the page.
  *
- * Returns: 0, or negative error code.
- * The key words are stored in *key on success.
- *
- * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
+ * fshared is NULL for PROCESS_PRIVATE futexes.
+ * For other futexes, it points to &current->mm->mmap_sem and the
+ * caller must have taken the reader lock, but NOT any spinlocks.
  */
-int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
+		  union futex_key *key)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
@@ -162,11 +184,25 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * The futex address must be "naturally" aligned.
 	 */
 	key->both.offset = address % PAGE_SIZE;
-	if (unlikely((key->both.offset % sizeof(u32)) != 0))
+	if (unlikely((address % sizeof(u32)) != 0))
 		return -EINVAL;
 	address -= key->both.offset;
 
 	/*
+	 * PROCESS_PRIVATE futexes are fast.
+	 * As the mm cannot disappear under us and the 'key' only needs
+	 * virtual address, we don't even have to find the underlying vma.
+	 * Note: we do have to check 'uaddr' is a valid user address,
+	 * but access_ok() should be faster than find_vma()
+	 */
+	if (!fshared) {
+		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+			return -EFAULT;
+		key->private.mm = mm;
+		key->private.address = address;
+		return 0;
+	}
+	/*
 	 * The futex is hashed differently depending on whether
 	 * it's in a shared or private mapping. So check vma first.
 	 */
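The fast path above is the whole point of PROCESS_PRIVATE futexes: when userspace promises the futex word is not shared between address spaces, get_futex_key() needs neither find_vma() nor mmap_sem, only an access_ok() check. As a rough illustration (not part of the patch), a userspace sketch of opting in, assuming the FUTEX_WAIT/FUTEX_WAKE opcodes and the FUTEX_PRIVATE_FLAG modifier from the linux/futex.h that accompanies this series:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sleep while *uaddr still contains val; the PRIVATE flag lets the
 * kernel take the fast path above instead of walking the vma tree. */
static int futex_wait_private(int *uaddr, int val)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
		       val, NULL, NULL, 0);
}

/* Wake up to nr waiters blocked in futex_wait_private() on uaddr. */
static int futex_wake_private(int *uaddr, int nr)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAKE | FUTEX_PRIVATE_FLAG,
		       nr, NULL, NULL, 0);
}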
@@ -180,6 +216,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
 		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
 
+	/* Save the user address in the key */
+	key->uaddr = uaddr;
+
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
@@ -190,6 +229,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * mappings of _writable_ handles.
 	 */
 	if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+		key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
 		return 0;
@@ -199,7 +239,7 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	 * Linear file mappings are also simple.
 	 */
 	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
-	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
+	key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
 				     + vma->vm_pgoff);
@@ -227,16 +267,18 @@ EXPORT_SYMBOL_GPL(get_futex_key);
  * Take a reference to the resource addressed by a key.
  * Can be called while holding spinlocks.
  *
- * NOTE: mmap_sem MUST be held between get_futex_key() and calling this
- * function, if it is called at all. mmap_sem keeps key->shared.inode valid.
  */
 inline void get_futex_key_refs(union futex_key *key)
 {
-	if (key->both.ptr != 0) {
-		if (key->both.offset & 1)
+	if (key->both.ptr == 0)
+		return;
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+		case FUT_OFF_INODE:
 			atomic_inc(&key->shared.inode->i_count);
-		else
+			break;
+		case FUT_OFF_MMSHARED:
 			atomic_inc(&key->private.mm->mm_count);
+			break;
 	}
 }
 EXPORT_SYMBOL_GPL(get_futex_key_refs);
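The switch above replaces the old "offset & 1" test. Because the futex word must be u32-aligned, the two low bits of key->both.offset are always zero, so they can carry the key type instead. A sketch of the encoding this relies on (FUT_OFF_INODE and FUT_OFF_MMSHARED come from futex.h in this series; the values shown match how mainline later defined them and are given for illustration):

#define FUT_OFF_INODE		1	/* key references an inode (shared mapping) */
#define FUT_OFF_MMSHARED	2	/* key references an mm (private mapping) */

/* A PROCESS_PRIVATE key sets neither bit, so get_futex_key_refs() and
 * drop_futex_key_refs() fall through without touching any refcount,
 * which is exactly what makes the private fast path cheap. */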
@@ -247,11 +289,15 @@ EXPORT_SYMBOL_GPL(get_futex_key_refs);
  */
 void drop_futex_key_refs(union futex_key *key)
 {
-	if (key->both.ptr != 0) {
-		if (key->both.offset & 1)
+	if (key->both.ptr == 0)
+		return;
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+		case FUT_OFF_INODE:
 			iput(key->shared.inode);
-		else
+			break;
+		case FUT_OFF_MMSHARED:
 			mmdrop(key->private.mm);
+			break;
 	}
 }
 EXPORT_SYMBOL_GPL(drop_futex_key_refs);
@@ -268,28 +314,38 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 }
 
 /*
- * Fault handling. Called with current->mm->mmap_sem held.
+ * Fault handling.
+ * If fshared is non-NULL, current->mm->mmap_sem is already held.
  */
-static int futex_handle_fault(unsigned long address, int attempt)
+static int futex_handle_fault(unsigned long address,
+			      struct rw_semaphore *fshared, int attempt)
 {
 	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
+	int ret = -EFAULT;
 
-	if (attempt > 2 || !(vma = find_vma(mm, address)) ||
-	    vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
-		return -EFAULT;
+	if (attempt > 2)
+		return ret;
 
-	switch (handle_mm_fault(mm, vma, address, 1)) {
-	case VM_FAULT_MINOR:
-		current->min_flt++;
-		break;
-	case VM_FAULT_MAJOR:
-		current->maj_flt++;
-		break;
-	default:
-		return -EFAULT;
+	if (!fshared)
+		down_read(&mm->mmap_sem);
+	vma = find_vma(mm, address);
+	if (vma && address >= vma->vm_start &&
+	    (vma->vm_flags & VM_WRITE)) {
+		switch (handle_mm_fault(mm, vma, address, 1)) {
+		case VM_FAULT_MINOR:
+			ret = 0;
+			current->min_flt++;
+			break;
+		case VM_FAULT_MAJOR:
+			ret = 0;
+			current->maj_flt++;
+			break;
+		}
 	}
-	return 0;
+	if (!fshared)
+		up_read(&mm->mmap_sem);
+	return ret;
 }
 
 /*
@@ -439,18 +495,19 @@ void exit_pi_state_list(struct task_struct *curr)
 }
 
 static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+		union futex_key *key, struct futex_pi_state **ps)
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
-	struct list_head *head;
+	struct plist_head *head;
 	struct task_struct *p;
 	pid_t pid;
 
 	head = &hb->chain;
 
-	list_for_each_entry_safe(this, next, head, list) {
-		if (match_futex(&this->key, &me->key)) {
+	plist_for_each_entry_safe(this, next, head, list) {
+		if (match_futex(&this->key, key)) {
 			/*
 			 * Another waiter already exists - bump up
 			 * the refcount and return its pi_state:
@@ -465,7 +522,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 			WARN_ON(!atomic_read(&pi_state->refcount));
 
 			atomic_inc(&pi_state->refcount);
-			me->pi_state = pi_state;
+			*ps = pi_state;
 
 			return 0;
 		}
@@ -492,7 +549,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 
 	/* Store the key for possible exit cleanups: */
-	pi_state->key = me->key;
+	pi_state->key = *key;
 
 	spin_lock_irq(&p->pi_lock);
 	WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +559,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 
 	put_task_struct(p);
 
-	me->pi_state = pi_state;
+	*ps = pi_state;
 
 	return 0;
 }
@@ -513,12 +570,12 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
  */
 static void wake_futex(struct futex_q *q)
 {
-	list_del_init(&q->list);
+	plist_del(&q->list, &q->list.plist);
 	if (q->filp)
 		send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
 	/*
 	 * The lock in wake_up_all() is a crucial memory barrier after the
-	 * list_del_init() and also before assigning to q->lock_ptr.
+	 * plist_del() and also before assigning to q->lock_ptr.
 	 */
 	wake_up_all(&q->waiters);
 	/*
@@ -562,6 +619,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
 		newval = FUTEX_WAITERS | new_owner->pid;
+		/* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+		newval |= (uval & FUTEX_WAITER_REQUEUED);
 
 		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
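wake_futex_pi() composes the new futex value from the pending owner's TID plus status bits kept in the top of the 32-bit word. For orientation, the layout this code assumes (my reading of this series' linux/futex.h; FUTEX_WAITER_REQUEUED is new here, so verify the exact masks against the header):

#define FUTEX_WAITERS		0x80000000	/* at least one task is queued */
#define FUTEX_OWNER_DIED	0x40000000	/* owner exited holding the lock */
#define FUTEX_WAITER_REQUEUED	0x20000000	/* waiter moved here by requeue_pi */
#define FUTEX_TID_MASK		0x1fffffff	/* remaining bits: owner TID */

/* e.g. a PI futex owned by TID 4242 with queued waiters:
 *	val == (FUTEX_WAITERS | 4242)
 */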
@@ -629,17 +688,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static int futex_wake(u32 __user *uaddr, int nr_wake)
+static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
+		      int nr_wake)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct list_head *head;
+	struct plist_head *head;
 	union futex_key key;
 	int ret;
 
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &key);
+	ret = get_futex_key(uaddr, fshared, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -647,7 +708,7 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 	spin_lock(&hb->lock);
 	head = &hb->chain;
 
-	list_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
 			if (this->pi_state) {
 				ret = -EINVAL;
@@ -661,7 +722,261 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 
 	spin_unlock(&hb->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
+	return ret;
+}
+
+/*
+ * Called from futex_requeue_pi.
+ * Set FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; search its associated pi_state if an owner exists,
+ * or create a new one without owner.
+ */
+static inline int
+lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+			    union futex_key *key,
+			    struct futex_pi_state **pi_state)
+{
+	u32 curval, uval, newval;
+
+retry:
+	/*
+	 * We can't handle a fault cleanly because we can't
+	 * release the locks here. Simply return the fault.
+	 */
+	if (get_futex_value_locked(&curval, uaddr))
+		return -EFAULT;
+
+	/* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+	if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+	    != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+		/*
+		 * No waiters yet, we prepare the futex to have some waiters.
+		 */
+
+		uval = curval;
+		newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+		pagefault_disable();
+		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+		pagefault_enable();
+
+		if (unlikely(curval == -EFAULT))
+			return -EFAULT;
+		if (unlikely(curval != uval))
+			goto retry;
+	}
+
+	if (!(curval & FUTEX_TID_MASK)
+	    || lookup_pi_state(curval, hb, key, pi_state)) {
+		/* the futex has no owner (yet) or the lookup failed:
+		   allocate one pi_state without owner */
+
+		*pi_state = alloc_pi_state();
+
+		/* Already stores the key: */
+		(*pi_state)->key = *key;
+
+		/* init the mutex without owner */
+		__rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+	}
+
+	return 0;
+}
+
+/*
+ * Wake the first nr_wake waiters on futex1, then requeue the next
+ * nr_requeue waiters, hashed on one physical page, onto another
+ * physical page (the PI-futex uaddr2).
+ */
+static int futex_requeue_pi(u32 __user *uaddr1,
+			    struct rw_semaphore *fshared,
+			    u32 __user *uaddr2,
+			    int nr_wake, int nr_requeue, u32 *cmpval)
+{
+	union futex_key key1, key2;
+	struct futex_hash_bucket *hb1, *hb2;
+	struct plist_head *head1;
+	struct futex_q *this, *next;
+	struct futex_pi_state *pi_state2 = NULL;
+	struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+	struct rt_mutex *lock2 = NULL;
+	int ret, drop_count = 0;
+
+	if (refill_pi_state_cache())
+		return -ENOMEM;
+
+retry:
+	/*
+	 * First take all the futex related locks:
+	 */
+	if (fshared)
+		down_read(fshared);
+
+	ret = get_futex_key(uaddr1, fshared, &key1);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, fshared, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
+
+	double_lock_hb(hb1, hb2);
+
+	if (likely(cmpval != NULL)) {
+		u32 curval;
+
+		ret = get_futex_value_locked(&curval, uaddr1);
+
+		if (unlikely(ret)) {
+			spin_unlock(&hb1->lock);
+			if (hb1 != hb2)
+				spin_unlock(&hb2->lock);
+
+			/*
+			 * If we would have faulted, release mmap_sem, fault
+			 * it in and start all over again.
+			 */
+			if (fshared)
+				up_read(fshared);
+
+			ret = get_user(curval, uaddr1);
+
+			if (!ret)
+				goto retry;
+
+			return ret;
+		}
+		if (curval != *cmpval) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+	}
+
+	head1 = &hb1->chain;
+	plist_for_each_entry_safe(this, next, head1, list) {
+		if (!match_futex (&this->key, &key1))
+			continue;
+		if (++ret <= nr_wake) {
+			wake_futex(this);
+		} else {
+			/*
+			 * FIRST: get and set the pi_state
+			 */
+			if (!pi_state2) {
+				int s;
+				/* do this only the first time we requeue someone */
+				s = lookup_pi_state_for_requeue(uaddr2, hb2,
+								&key2, &pi_state2);
+				if (s) {
+					ret = s;
+					goto out_unlock;
+				}
+
+				lock2 = &pi_state2->pi_mutex;
+				spin_lock(&lock2->wait_lock);
+
+				/* Save the top waiter of the wait_list */
+				if (rt_mutex_has_waiters(lock2))
+					top_waiter = rt_mutex_top_waiter(lock2);
+			} else
+				atomic_inc(&pi_state2->refcount);
+
+
+			this->pi_state = pi_state2;
+
+			/*
+			 * SECOND: requeue futex_q to the correct hashbucket
+			 */
+
+			/*
+			 * If key1 and key2 hash to the same bucket, no need to
+			 * requeue.
+			 */
+			if (likely(head1 != &hb2->chain)) {
+				plist_del(&this->list, &hb1->chain);
+				plist_add(&this->list, &hb2->chain);
+				this->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+				this->list.plist.lock = &hb2->lock;
+#endif
+			}
+			this->key = key2;
+			get_futex_key_refs(&key2);
+			drop_count++;
+
+
+			/*
+			 * THIRD: queue it to lock2
+			 */
+			spin_lock_irq(&this->task->pi_lock);
+			waiter = &this->waiter;
+			waiter->task = this->task;
+			waiter->lock = lock2;
+			plist_node_init(&waiter->list_entry, this->task->prio);
+			plist_node_init(&waiter->pi_list_entry, this->task->prio);
+			plist_add(&waiter->list_entry, &lock2->wait_list);
+			this->task->pi_blocked_on = waiter;
+			spin_unlock_irq(&this->task->pi_lock);
+
+			if (ret - nr_wake >= nr_requeue)
+				break;
+		}
+	}
+
+	/* If we've requeued some tasks and the top_waiter of the rt_mutex
+	   has changed, we must adjust the priority of the owner, if any */
+	if (drop_count) {
+		struct task_struct *owner = rt_mutex_owner(lock2);
+		if (owner &&
+		    (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+			int chain_walk = 0;
+
+			spin_lock_irq(&owner->pi_lock);
+			if (top_waiter)
+				plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+			else
+				/*
+				 * There were no waiters before the requeue,
+				 * the flag must be updated
+				 */
+				mark_rt_mutex_waiters(lock2);
+
+			plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+			__rt_mutex_adjust_prio(owner);
+			if (owner->pi_blocked_on) {
+				chain_walk = 1;
+				get_task_struct(owner);
+			}
+
+			spin_unlock_irq(&owner->pi_lock);
+			spin_unlock(&lock2->wait_lock);
+
+			if (chain_walk)
+				rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+							   current);
+		} else {
+			/* No owner or the top_waiter does not change */
+			mark_rt_mutex_waiters(lock2);
+			spin_unlock(&lock2->wait_lock);
+		}
+	}
+
+out_unlock:
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
+
+	/* drop_futex_key_refs() must be called outside the spinlocks. */
+	while (--drop_count >= 0)
+		drop_futex_key_refs(&key1);
+
+out:
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
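futex_requeue_pi() is the kernel half of a condvar broadcast over a PI mutex: wake nr_wake waiters outright and migrate up to nr_requeue of the rest onto the PI futex, where they block on its rt_mutex instead of all racing for the lock in userspace. A hedged sketch of the intended caller, assuming the FUTEX_CMP_REQUEUE_PI opcode added elsewhere in this series and the usual futex calling convention (nr_requeue travels in the timeout slot, the compare value in val3):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <limits.h>

/* Broadcast: wake one waiter on cond_futex, requeue the others onto
 * the PI futex mutex_futex. Fails with -EAGAIN if *cond_futex no
 * longer holds cond_val (the cmpval check in futex_requeue_pi). */
static int cond_broadcast_requeue_pi(int *cond_futex, int cond_val,
				     int *mutex_futex)
{
	return syscall(SYS_futex, cond_futex, FUTEX_CMP_REQUEUE_PI,
		       1 /* nr_wake */, INT_MAX /* nr_requeue */,
		       mutex_futex, cond_val);
}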
@@ -670,22 +985,24 @@ out:
  * to this virtual address:
  */
 static int
-futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
+futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared,
+	      u32 __user *uaddr2,
 	      int nr_wake, int nr_wake2, int op)
 {
 	union futex_key key1, key2;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct list_head *head;
+	struct plist_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret, attempt = 0;
 
 retryfull:
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -725,11 +1042,10 @@ retry:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr2,
-				       attempt)) {
-			ret = -EFAULT;
+		ret = futex_handle_fault((unsigned long)uaddr2,
+					 fshared, attempt);
+		if (ret)
 			goto out;
-		}
 		goto retry;
 	}
 
@@ -737,7 +1053,8 @@ retry:
 	 * If we would have faulted, release mmap_sem,
 	 * fault it in and start all over again.
 	 */
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	ret = get_user(dummy, uaddr2);
 	if (ret)
@@ -748,7 +1065,7 @@ retry:
 
 	head = &hb1->chain;
 
-	list_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key1)) {
 			wake_futex(this);
 			if (++ret >= nr_wake)
@@ -760,7 +1077,7 @@ retry:
 		head = &hb2->chain;
 
 		op_ret = 0;
-		list_for_each_entry_safe(this, next, head, list) {
+		plist_for_each_entry_safe(this, next, head, list) {
 			if (match_futex (&this->key, &key2)) {
 				wake_futex(this);
 				if (++op_ret >= nr_wake2)
@@ -774,7 +1091,8 @@ retry:
 	if (hb1 != hb2)
 		spin_unlock(&hb2->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -782,22 +1100,24 @@ out:
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
+static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
+			 u32 __user *uaddr2,
 			 int nr_wake, int nr_requeue, u32 *cmpval)
 {
 	union futex_key key1, key2;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct list_head *head1;
+	struct plist_head *head1;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
 
 retry:
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, fshared, &key2);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -820,7 +1140,8 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
 		 * If we would have faulted, release mmap_sem, fault
 		 * it in and start all over again.
 		 */
-		up_read(&current->mm->mmap_sem);
+		if (fshared)
+			up_read(fshared);
 
 		ret = get_user(curval, uaddr1);
 
@@ -836,7 +1157,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
 	}
 
 	head1 = &hb1->chain;
-	list_for_each_entry_safe(this, next, head1, list) {
+	plist_for_each_entry_safe(this, next, head1, list) {
 		if (!match_futex (&this->key, &key1))
 			continue;
 		if (++ret <= nr_wake) {
@@ -847,9 +1168,13 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
 		 * requeue.
 		 */
 		if (likely(head1 != &hb2->chain)) {
-			list_move_tail(&this->list, &hb2->chain);
+			plist_del(&this->list, &hb1->chain);
+			plist_add(&this->list, &hb2->chain);
 			this->lock_ptr = &hb2->lock;
-		}
+#ifdef CONFIG_DEBUG_PI_LIST
+			this->list.plist.lock = &hb2->lock;
+#endif
+		}
 		this->key = key2;
 		get_futex_key_refs(&key2);
 		drop_count++;
@@ -869,7 +1194,8 @@ out_unlock:
 		drop_futex_key_refs(&key1);
 
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
@@ -894,7 +1220,23 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
 
 static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
-	list_add_tail(&q->list, &hb->chain);
+	int prio;
+
+	/*
+	 * The priority used to register this element is
+	 * - either the real thread-priority for the real-time threads
+	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
+	 * - or MAX_RT_PRIO for non-RT threads.
+	 * Thus, all RT-threads are woken first in priority order, and
+	 * the others are woken last, in FIFO order.
+	 */
+	prio = min(current->normal_prio, MAX_RT_PRIO);
+
+	plist_node_init(&q->list, prio);
+#ifdef CONFIG_DEBUG_PI_LIST
+	q->list.plist.lock = &hb->lock;
+#endif
+	plist_add(&q->list, &hb->chain);
 	q->task = current;
 	spin_unlock(&hb->lock);
 }
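A short worked example of the ordering the comment in __queue_me() describes, using the kernel's priority scale (0..99 are RT priorities, lower is more urgent; 100..139 are normal tasks; MAX_RT_PRIO is 100). This is illustration only, not code from the patch:

/* plist key used by __queue_me(): min(normal_prio, MAX_RT_PRIO) */
static int futex_plist_key(int normal_prio)
{
	return normal_prio < 100 ? normal_prio : 100;
}

/*
 * futex_plist_key(12)  == 12	RT waiter, sorted ahead of key 50
 * futex_plist_key(50)  == 50	lower-priority RT waiter
 * futex_plist_key(120) == 100	every non-RT waiter collapses to 100,
 *				so plist's FIFO tie-break preserves
 *				their arrival order
 */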
@@ -949,8 +1291,8 @@ static int unqueue_me(struct futex_q *q)
 		spin_unlock(lock_ptr);
 		goto retry;
 	}
-	WARN_ON(list_empty(&q->list));
-	list_del(&q->list);
+	WARN_ON(plist_node_empty(&q->list));
+	plist_del(&q->list, &q->list.plist);
 
 	BUG_ON(q->pi_state);
 
@@ -964,39 +1306,104 @@ static int unqueue_me(struct futex_q *q)
 
 /*
  * PI futexes cannot be requeued and must remove themselves from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
  */
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+static void unqueue_me_pi(struct futex_q *q)
 {
-	WARN_ON(list_empty(&q->list));
-	list_del(&q->list);
+	WARN_ON(plist_node_empty(&q->list));
+	plist_del(&q->list, &q->list.plist);
 
 	BUG_ON(!q->pi_state);
 	free_pi_state(q->pi_state);
 	q->pi_state = NULL;
 
-	spin_unlock(&hb->lock);
+	spin_unlock(q->lock_ptr);
 
 	drop_futex_key_refs(&q->key);
 }
 
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The curr->mm semaphore must be held; it is released at return of this
+ * function.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
+				struct futex_q *q,
+				struct futex_hash_bucket *hb,
+				struct task_struct *curr)
+{
+	u32 newtid = curr->pid | FUTEX_WAITERS;
+	struct futex_pi_state *pi_state = q->pi_state;
+	u32 uval, curval, newval;
+	int ret;
+
+	/* Owner died? */
+	if (pi_state->owner != NULL) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+	} else
+		newtid |= FUTEX_OWNER_DIED;
+
+	pi_state->owner = curr;
+
+	spin_lock_irq(&curr->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
+	list_add(&pi_state->list, &curr->pi_state_list);
+	spin_unlock_irq(&curr->pi_lock);
+
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(q);
+	if (fshared)
+		up_read(fshared);
+	/*
+	 * We own it, so we have to replace the pending owner
+	 * TID. This must be atomic as we have to preserve the
+	 * owner died bit here.
+	 */
+	ret = get_user(uval, uaddr);
+	while (!ret) {
+		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+		newval |= (uval & FUTEX_WAITER_REQUEUED);
+		curval = futex_atomic_cmpxchg_inatomic(uaddr,
+						       uval, newval);
+		if (curval == -EFAULT)
+			ret = -EFAULT;
+		if (curval == uval)
+			break;
+		uval = curval;
+	}
+	return ret;
+}
+
+/*
+ * In case we must use restart_block to restart a futex_wait,
+ * we encode in the 'arg3' shared capability
+ */
+#define ARG3_SHARED 1
+
 static long futex_wait_restart(struct restart_block *restart);
-static int futex_wait_abstime(u32 __user *uaddr, u32 val,
-			      int timed, unsigned long abs_time)
+static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
+		      u32 val, ktime_t *abs_time)
 {
 	struct task_struct *curr = current;
 	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
-	unsigned long time_left = 0;
 	u32 uval;
 	int ret;
+	struct hrtimer_sleeper t, *to = NULL;
+	int rem = 0;
 
 	q.pi_state = NULL;
 retry:
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &q.key);
+	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
@@ -1019,8 +1426,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	 * a wakeup when *uaddr != val on entry to the syscall. This is
 	 * rare, but normal.
 	 *
-	 * We hold the mmap semaphore, so the mapping cannot have changed
-	 * since we looked it up in get_futex_key.
+	 * For shared futexes, we hold the mmap semaphore, so the mapping
+	 * cannot have changed since we looked it up in get_futex_key.
 	 */
 	ret = get_futex_value_locked(&uval, uaddr);
 
@@ -1031,7 +1438,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	 * If we would have faulted, release mmap_sem, fault it in and
 	 * start all over again.
 	 */
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	ret = get_user(uval, uaddr);
 
@@ -1043,6 +1451,14 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	if (uval != val)
 		goto out_unlock_release_sem;
 
+	/*
+	 * This rt_mutex_waiter structure is prepared here and will
+	 * be used only if this task is requeued from a normal futex to
+	 * a PI-futex with futex_requeue_pi.
+	 */
+	debug_rt_mutex_init_waiter(&q.waiter);
+	q.waiter.task = NULL;
+
 	/* Only actually queue if *uaddr contained val. */
 	__queue_me(&q, hb);
 
@@ -1050,7 +1466,8 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	/*
 	 * There might have been scheduling since the queue_me(), as we
@@ -1065,23 +1482,33 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	__set_current_state(TASK_INTERRUPTIBLE);
 	add_wait_queue(&q.waiters, &wait);
 	/*
-	 * !list_empty() is safe here without any lock.
+	 * !plist_node_empty() is safe here without any lock.
 	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
 	 */
-	time_left = 0;
-	if (likely(!list_empty(&q.list))) {
-		unsigned long rel_time;
-
-		if (timed) {
-			unsigned long now = jiffies;
-			if (time_after(now, abs_time))
-				rel_time = 0;
-			else
-				rel_time = abs_time - now;
-		} else
-			rel_time = MAX_SCHEDULE_TIMEOUT;
-
-		time_left = schedule_timeout(rel_time);
+	if (likely(!plist_node_empty(&q.list))) {
+		if (!abs_time)
+			schedule();
+		else {
+			to = &t;
+			hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+			hrtimer_init_sleeper(&t, current);
+			t.timer.expires = *abs_time;
+
+			hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
+
+			/*
+			 * the timer could have already expired, in which
+			 * case current would be flagged for rescheduling.
+			 * Don't bother calling schedule.
+			 */
+			if (likely(t.task))
+				schedule();
+
+			hrtimer_cancel(&t.timer);
+
+			/* Flag if a timeout occurred */
+			rem = (t.task == NULL);
+		}
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -1090,17 +1517,80 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	 * we are the only user of it.
 	 */
 
+	if (q.pi_state) {
+		/*
+		 * We were woken but have been requeued on a PI-futex.
+		 * We have to complete the lock acquisition by taking
+		 * the rtmutex.
+		 */
+
+		struct rt_mutex *lock = &q.pi_state->pi_mutex;
+
+		spin_lock(&lock->wait_lock);
+		if (unlikely(q.waiter.task)) {
+			remove_waiter(lock, &q.waiter);
+		}
+		spin_unlock(&lock->wait_lock);
+
+		if (rem)
+			ret = -ETIMEDOUT;
+		else
+			ret = rt_mutex_timed_lock(lock, to, 1);
+
+		if (fshared)
+			down_read(fshared);
+		spin_lock(q.lock_ptr);
+
+		/*
+		 * Got the lock. We might not be the anticipated owner if we
+		 * did a lock-steal - fix up the PI-state in that case.
+		 */
+		if (!ret && q.pi_state->owner != curr) {
+			/*
+			 * We MUST play with the futex we were requeued on,
+			 * NOT the current futex.
+			 * We can retrieve it from the key of the pi_state
+			 */
+			uaddr = q.pi_state->key.uaddr;
+
+			/* mmap_sem and hash_bucket lock are unlocked at
+			   return of this function */
+			ret = fixup_pi_state_owner(uaddr, fshared,
+						   &q, hb, curr);
+		} else {
+			/*
+			 * Catch the rare case, where the lock was released
+			 * when we were on the way back before we locked
+			 * the hash bucket.
+			 */
+			if (ret && q.pi_state->owner == curr) {
+				if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+					ret = 0;
+			}
+			/* Unqueue and drop the lock */
+			unqueue_me_pi(&q);
+			if (fshared)
+				up_read(fshared);
+		}
+
+		debug_rt_mutex_free_waiter(&q.waiter);
+
+		return ret;
+	}
+
+	debug_rt_mutex_free_waiter(&q.waiter);
+
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	if (!unqueue_me(&q))
 		return 0;
-	if (time_left == 0)
+	if (rem)
 		return -ETIMEDOUT;
 
 	/*
 	 * We expect signal_pending(current), but another thread may
 	 * have handled it for us already.
 	 */
-	if (time_left == MAX_SCHEDULE_TIMEOUT)
+	if (!abs_time)
 		return -ERESTARTSYS;
 	else {
 		struct restart_block *restart;
@@ -1108,8 +1598,10 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 		restart->fn = futex_wait_restart;
 		restart->arg0 = (unsigned long)uaddr;
 		restart->arg1 = (unsigned long)val;
-		restart->arg2 = (unsigned long)timed;
-		restart->arg3 = abs_time;
+		restart->arg2 = (unsigned long)abs_time;
+		restart->arg3 = 0;
+		if (fshared)
+			restart->arg3 |= ARG3_SHARED;
 		return -ERESTART_RESTARTBLOCK;
 	}
 
@@ -1117,65 +1609,111 @@ static int futex_wait_abstime(u32 __user *uaddr, u32 val,
 	queue_unlock(&q, hb);
 
 out_release_sem:
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 }
 
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
-{
-	int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
-	return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
-}
 
 static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = (u32 __user *)restart->arg0;
 	u32 val = (u32)restart->arg1;
-	int timed = (int)restart->arg2;
-	unsigned long abs_time = restart->arg3;
+	ktime_t *abs_time = (ktime_t *)restart->arg2;
+	struct rw_semaphore *fshared = NULL;
 
 	restart->fn = do_no_restart_syscall;
-	return (long)futex_wait_abstime(uaddr, val, timed, abs_time);
+	if (restart->arg3 & ARG3_SHARED)
+		fshared = &current->mm->mmap_sem;
+	return (long)futex_wait(uaddr, fshared, val, abs_time);
 }
 
 
+static void set_pi_futex_owner(struct futex_hash_bucket *hb,
+			       union futex_key *key, struct task_struct *p)
+{
+	struct plist_head *head;
+	struct futex_q *this, *next;
+	struct futex_pi_state *pi_state = NULL;
+	struct rt_mutex *lock;
+
+	/* Search for a waiter that should already exist */
+
+	head = &hb->chain;
+
+	plist_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, key)) {
+			pi_state = this->pi_state;
+			break;
+		}
+	}
+
+	BUG_ON(!pi_state);
+
+	/* set p as pi_state's owner */
+	lock = &pi_state->pi_mutex;
+
+	spin_lock(&lock->wait_lock);
+	spin_lock_irq(&p->pi_lock);
+
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+
+
+	/* set p as pi_mutex's owner */
+	debug_rt_mutex_proxy_lock(lock, p);
+	WARN_ON(rt_mutex_owner(lock));
+	rt_mutex_set_owner(lock, p, 0);
+	rt_mutex_deadlock_account_lock(lock, p);
+
+	plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
+		  &p->pi_waiters);
+	__rt_mutex_adjust_prio(p);
+
+	spin_unlock_irq(&p->pi_lock);
+	spin_unlock(&lock->wait_lock);
+}
+
 /*
  * Userspace tried a 0 -> TID atomic transition of the futex value
  * and failed. The kernel side here does the whole locking operation:
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
-			 long nsec, int trylock)
+static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
+			 int detect, ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct task_struct *curr = current;
 	struct futex_hash_bucket *hb;
 	u32 uval, newval, curval;
 	struct futex_q q;
-	int ret, attempt = 0;
+	int ret, lock_held, attempt = 0;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
 
-	if (sec != MAX_SCHEDULE_TIMEOUT) {
+	if (time) {
 		to = &timeout;
 		hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(to, current);
-		to->timer.expires = ktime_set(sec, nsec);
+		to->timer.expires = *time;
 	}
 
 	q.pi_state = NULL;
 retry:
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &q.key);
+	ret = get_futex_key(uaddr, fshared, &q.key);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
 	hb = queue_lock(&q, -1, NULL);
 
 retry_locked:
+	lock_held = 0;
+
 	/*
 	 * To avoid races, we attempt to take the lock here again
 	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1194,7 +1732,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
 		if (!detect && 0)
 			force_sig(SIGKILL, current);
-		ret = -EDEADLK;
+		/*
+		 * Normally, this check is done in user space.
+		 * In case of requeue, the owner may attempt to lock this futex,
+		 * even if the ownership has already been given by the previous
+		 * waker.
+		 * In the usual case, this is a case of deadlock, but not in case
+		 * of REQUEUE_PI.
+		 */
+		if (!(curval & FUTEX_WAITER_REQUEUED))
+			ret = -EDEADLK;
 		goto out_unlock_release_sem;
 	}
 
@@ -1206,7 +1753,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 		goto out_unlock_release_sem;
 
 	uval = curval;
-	newval = uval | FUTEX_WAITERS;
+	/*
+	 * In case of a requeue, check if there already is an owner.
+	 * If not, just take the futex.
+	 */
+	if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+		/* set current as futex owner */
+		newval = curval | current->pid;
+		lock_held = 1;
+	} else
+		/* Set the WAITERS flag, so the owner will know it has someone
+		   to wake at next unlock */
+		newval = curval | FUTEX_WAITERS;
 
 	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1217,11 +1775,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	if (unlikely(curval != uval))
 		goto retry_locked;
 
+	if (lock_held) {
+		set_pi_futex_owner(hb, &q.key, curr);
+		goto out_unlock_release_sem;
+	}
+
 	/*
 	 * We don't have the lock. Look up the PI state (or create it if
 	 * we are the first waiter):
 	 */
-	ret = lookup_pi_state(uval, hb, &q);
+	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
 
 	if (unlikely(ret)) {
 		/*
@@ -1263,7 +1826,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	 * Now the futex is queued and we have checked the data, we
 	 * don't want to hold mmap_sem while we sleep.
 	 */
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	WARN_ON(!q.pi_state);
 	/*
@@ -1277,52 +1841,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 		ret = ret ? 0 : -EWOULDBLOCK;
 	}
 
-	down_read(&curr->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 	spin_lock(q.lock_ptr);
 
 	/*
 	 * Got the lock. We might not be the anticipated owner if we
 	 * did a lock-steal - fix up the PI-state in that case.
 	 */
-	if (!ret && q.pi_state->owner != curr) {
-		u32 newtid = current->pid | FUTEX_WAITERS;
-
-		/* Owner died? */
-		if (q.pi_state->owner != NULL) {
-			spin_lock_irq(&q.pi_state->owner->pi_lock);
-			WARN_ON(list_empty(&q.pi_state->list));
-			list_del_init(&q.pi_state->list);
-			spin_unlock_irq(&q.pi_state->owner->pi_lock);
-		} else
-			newtid |= FUTEX_OWNER_DIED;
-
-		q.pi_state->owner = current;
-
-		spin_lock_irq(&current->pi_lock);
-		WARN_ON(!list_empty(&q.pi_state->list));
-		list_add(&q.pi_state->list, &current->pi_state_list);
-		spin_unlock_irq(&current->pi_lock);
-
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
-		up_read(&curr->mm->mmap_sem);
-		/*
-		 * We own it, so we have to replace the pending owner
-		 * TID. This must be atomic as we have preserve the
-		 * owner died bit here.
-		 */
-		ret = get_user(uval, uaddr);
-		while (!ret) {
-			newval = (uval & FUTEX_OWNER_DIED) | newtid;
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			if (curval == -EFAULT)
-				ret = -EFAULT;
-			if (curval == uval)
-				break;
-			uval = curval;
-		}
-	} else {
+	if (!ret && q.pi_state->owner != curr)
+		/* mmap_sem is unlocked at return of this function */
+		ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
+	else {
 		/*
 		 * Catch the rare case, where the lock was released
 		 * when we were on the way back before we locked
@@ -1333,8 +1863,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 			ret = 0;
 		}
 		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
-		up_read(&curr->mm->mmap_sem);
+		unqueue_me_pi(&q);
+		if (fshared)
+			up_read(fshared);
 	}
 
 	if (!detect && ret == -EDEADLK && 0)
@@ -1346,7 +1877,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	queue_unlock(&q, hb);
 
 out_release_sem:
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 	return ret;
 
 uaddr_faulted:
@@ -1357,15 +1889,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
+		ret = futex_handle_fault((unsigned long)uaddr, fshared,
+					 attempt);
+		if (ret)
 			goto out_unlock_release_sem;
-		}
 		goto retry_locked;
 	}
 
 	queue_unlock(&q, hb);
-	up_read(&curr->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	ret = get_user(uval, uaddr);
 	if (!ret && (uval != -EFAULT))
@@ -1379,12 +1912,12 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
  */
-static int futex_unlock_pi(u32 __user *uaddr)
+static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
 	u32 uval;
-	struct list_head *head;
+	struct plist_head *head;
 	union futex_key key;
 	int ret, attempt = 0;
 
@@ -1399,9 +1932,10 @@ retry:
 	/*
 	 * First take all the futex related locks:
 	 */
-	down_read(&current->mm->mmap_sem);
+	if (fshared)
+		down_read(fshared);
 
-	ret = get_futex_key(uaddr, &key);
+	ret = get_futex_key(uaddr, fshared, &key);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -1435,7 +1969,7 @@ retry_locked:
 	 */
 	head = &hb->chain;
 
-	list_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, head, list) {
 		if (!match_futex (&this->key, &key))
 			continue;
 		ret = wake_futex_pi(uaddr, uval, this);
@@ -1460,7 +1994,8 @@ retry_locked:
 out_unlock:
 	spin_unlock(&hb->lock);
 out:
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	return ret;
 
@@ -1472,15 +2007,16 @@ pi_faulted:
 	 * still holding the mmap_sem.
 	 */
 	if (attempt++) {
-		if (futex_handle_fault((unsigned long)uaddr, attempt)) {
-			ret = -EFAULT;
+		ret = futex_handle_fault((unsigned long)uaddr, fshared,
+					 attempt);
+		if (ret)
 			goto out_unlock;
-		}
 		goto retry_locked;
 	}
 
 	spin_unlock(&hb->lock);
-	up_read(&current->mm->mmap_sem);
+	if (fshared)
+		up_read(fshared);
 
 	ret = get_user(uval, uaddr);
 	if (!ret && (uval != -EFAULT))
@@ -1509,10 +2045,10 @@ static unsigned int futex_poll(struct file *filp,
1509 poll_wait(filp, &q->waiters, wait); 2045 poll_wait(filp, &q->waiters, wait);
1510 2046
1511 /* 2047 /*
1512 * list_empty() is safe here without any lock. 2048 * plist_node_empty() is safe here without any lock.
1513 * q->lock_ptr != 0 is not safe, because of ordering against wakeup. 2049 * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
1514 */ 2050 */
1515 if (list_empty(&q->list)) 2051 if (plist_node_empty(&q->list))
1516 ret = POLLIN | POLLRDNORM; 2052 ret = POLLIN | POLLRDNORM;
1517 2053
1518 return ret; 2054 return ret;
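
The lockless test works because the waker always empties the plist node before any wakeup becomes visible, and clears lock_ptr only afterwards; testing lock_ptr here could therefore race. Roughly, the waker's ordering (a sketch; see wake_futex() in the full file):

	plist_del(&q->list, &q->list.plist);	/* 1. node emptied first      */
	wake_up_all(&q->waiters);		/* 2. then waiters are woken  */
	smp_wmb();
	q->lock_ptr = NULL;			/* 3. lock_ptr cleared last   */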
@@ -1532,6 +2068,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1532 struct futex_q *q; 2068 struct futex_q *q;
1533 struct file *filp; 2069 struct file *filp;
1534 int ret, err; 2070 int ret, err;
2071 struct rw_semaphore *fshared;
1535 static unsigned long printk_interval; 2072 static unsigned long printk_interval;
1536 2073
1537 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { 2074 if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
@@ -1573,11 +2110,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
1573 } 2110 }
1574 q->pi_state = NULL; 2111 q->pi_state = NULL;
1575 2112
1576 down_read(&current->mm->mmap_sem); 2113 fshared = &current->mm->mmap_sem;
1577 err = get_futex_key(uaddr, &q->key); 2114 down_read(fshared);
2115 err = get_futex_key(uaddr, fshared, &q->key);
1578 2116
1579 if (unlikely(err != 0)) { 2117 if (unlikely(err != 0)) {
1580 up_read(&current->mm->mmap_sem); 2118 up_read(fshared);
1581 kfree(q); 2119 kfree(q);
1582 goto error; 2120 goto error;
1583 } 2121 }
@@ -1589,7 +2127,7 @@ static int futex_fd(u32 __user *uaddr, int signal)
1589 filp->private_data = q; 2127 filp->private_data = q;
1590 2128
1591 queue_me(q, ret, filp); 2129 queue_me(q, ret, filp);
1592 up_read(&current->mm->mmap_sem); 2130 up_read(fshared);
1593 2131
1594 /* Now we map fd to filp, so userspace can access it */ 2132 /* Now we map fd to filp, so userspace can access it */
1595 fd_install(ret, filp); 2133 fd_install(ret, filp);
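
Note that futex_fd() sets fshared to &current->mm->mmap_sem unconditionally, presumably because the resulting fd can be handed to another process, so the private fast path is never safe here. For context, a hypothetical userspace sketch of the interface futex_poll() services (FUTEX_FD was removed from later kernels, so treat this as historical):

	#include <poll.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	/* Wait for a wakeup on *fut via poll() instead of FUTEX_WAIT. */
	static int wait_via_fd(int *fut)
	{
		int ffd = syscall(SYS_futex, fut, FUTEX_FD,
				  0 /* val == 0: no SIGIO delivery */,
				  NULL, NULL, 0);
		struct pollfd pfd = { .fd = ffd, .events = POLLIN };

		if (ffd < 0)
			return -1;
		poll(&pfd, 1, -1);	/* POLLIN | POLLRDNORM once woken */
		close(ffd);
		return 0;
	}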
@@ -1702,6 +2240,8 @@ retry:
1702 * userspace. 2240 * userspace.
1703 */ 2241 */
1704 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; 2242 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2243 /* Also keep the FUTEX_WAITER_REQUEUED flag if set */
2244 mval |= (uval & FUTEX_WAITER_REQUEUED);
1705 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); 2245 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1706 2246
1707 if (nval == -EFAULT) 2247 if (nval == -EFAULT)
@@ -1716,7 +2256,7 @@ retry:
1716 */ 2256 */
1717 if (!pi) { 2257 if (!pi) {
1718 if (uval & FUTEX_WAITERS) 2258 if (uval & FUTEX_WAITERS)
1719 futex_wake(uaddr, 1); 2259 futex_wake(uaddr, &curr->mm->mmap_sem, 1);
1720 } 2260 }
1721 } 2261 }
1722 return 0; 2262 return 0;
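
The added line preserves FUTEX_WAITER_REQUEUED across the owner-died rewrite alongside FUTEX_WAITERS, so a pending requeue is not lost when the owner exits. A standalone worked example of the bit manipulation (the FUTEX_WAITER_REQUEUED value is assumed for illustration; see the linux/futex.h of this tree):

	#include <stdio.h>
	#include <stdint.h>

	#define FUTEX_WAITERS		0x80000000
	#define FUTEX_OWNER_DIED	0x40000000
	#define FUTEX_WAITER_REQUEUED	0x20000000	/* illustrative value */

	int main(void)
	{
		/* A dead owner with TID 0x1234 and both flags set: */
		uint32_t uval = 0x1234 | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
		uint32_t mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

		mval |= uval & FUTEX_WAITER_REQUEUED;	/* the added line */
		printf("uval=%#x -> mval=%#x\n", uval, mval);
		/* TID bits gone, OWNER_DIED set, both waiter flags kept. */
		return 0;
	}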
@@ -1772,7 +2312,8 @@ void exit_robust_list(struct task_struct *curr)
1772 return; 2312 return;
1773 2313
1774 if (pending) 2314 if (pending)
1775 handle_futex_death((void __user *)pending + futex_offset, curr, pip); 2315 handle_futex_death((void __user *)pending + futex_offset,
2316 curr, pip);
1776 2317
1777 while (entry != &head->list) { 2318 while (entry != &head->list) {
1778 /* 2319 /*
@@ -1798,39 +2339,47 @@ void exit_robust_list(struct task_struct *curr)
1798 } 2339 }
1799} 2340}
1800 2341
1801long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, 2342long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1802 u32 __user *uaddr2, u32 val2, u32 val3) 2343 u32 __user *uaddr2, u32 val2, u32 val3)
1803{ 2344{
1804 int ret; 2345 int ret;
2346 int cmd = op & FUTEX_CMD_MASK;
2347 struct rw_semaphore *fshared = NULL;
2348
2349 if (!(op & FUTEX_PRIVATE_FLAG))
2350 fshared = &current->mm->mmap_sem;
1805 2351
1806 switch (op) { 2352 switch (cmd) {
1807 case FUTEX_WAIT: 2353 case FUTEX_WAIT:
1808 ret = futex_wait(uaddr, val, timeout); 2354 ret = futex_wait(uaddr, fshared, val, timeout);
1809 break; 2355 break;
1810 case FUTEX_WAKE: 2356 case FUTEX_WAKE:
1811 ret = futex_wake(uaddr, val); 2357 ret = futex_wake(uaddr, fshared, val);
1812 break; 2358 break;
1813 case FUTEX_FD: 2359 case FUTEX_FD:
1814 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ 2360 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
1815 ret = futex_fd(uaddr, val); 2361 ret = futex_fd(uaddr, val);
1816 break; 2362 break;
1817 case FUTEX_REQUEUE: 2363 case FUTEX_REQUEUE:
1818 ret = futex_requeue(uaddr, uaddr2, val, val2, NULL); 2364 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
1819 break; 2365 break;
1820 case FUTEX_CMP_REQUEUE: 2366 case FUTEX_CMP_REQUEUE:
1821 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 2367 ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
1822 break; 2368 break;
1823 case FUTEX_WAKE_OP: 2369 case FUTEX_WAKE_OP:
1824 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 2370 ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
1825 break; 2371 break;
1826 case FUTEX_LOCK_PI: 2372 case FUTEX_LOCK_PI:
1827 ret = futex_lock_pi(uaddr, val, timeout, val2, 0); 2373 ret = futex_lock_pi(uaddr, fshared, val, timeout, 0);
1828 break; 2374 break;
1829 case FUTEX_UNLOCK_PI: 2375 case FUTEX_UNLOCK_PI:
1830 ret = futex_unlock_pi(uaddr); 2376 ret = futex_unlock_pi(uaddr, fshared);
1831 break; 2377 break;
1832 case FUTEX_TRYLOCK_PI: 2378 case FUTEX_TRYLOCK_PI:
1833 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); 2379 ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
2380 break;
2381 case FUTEX_CMP_REQUEUE_PI:
2382 ret = futex_requeue_pi(uaddr, fshared, uaddr2, val, val2, &val3);
1834 break; 2383 break;
1835 default: 2384 default:
1836 ret = -ENOSYS; 2385 ret = -ENOSYS;
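
do_futex() now splits op into a command (via FUTEX_CMD_MASK) and flag bits, so FUTEX_PRIVATE_FLAG can be OR-ed onto any command; its absence is what makes fshared non-NULL. From userspace, opting in looks like this (raw syscall, since glibc provides no futex() wrapper; requires a linux/futex.h new enough to define the flag):

	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	static long futex_wait_private(int *uaddr, int val)
	{
		return syscall(SYS_futex, uaddr,
			       FUTEX_WAIT | FUTEX_PRIVATE_FLAG, val,
			       NULL, NULL, 0);
	}

	static long futex_wake_private(int *uaddr, int nr)
	{
		return syscall(SYS_futex, uaddr,
			       FUTEX_WAKE | FUTEX_PRIVATE_FLAG, nr,
			       NULL, NULL, 0);
	}

The flag is a promise, not a request: the caller asserts the futex word is never shared across address spaces, which lets the kernel skip mmap_sem and the vma walk when keying the futex.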
@@ -1843,29 +2392,30 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1843 struct timespec __user *utime, u32 __user *uaddr2, 2392 struct timespec __user *utime, u32 __user *uaddr2,
1844 u32 val3) 2393 u32 val3)
1845{ 2394{
1846 struct timespec t; 2395 struct timespec ts;
1847 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 2396 ktime_t t, *tp = NULL;
1848 u32 val2 = 0; 2397 u32 val2 = 0;
2398 int cmd = op & FUTEX_CMD_MASK;
1849 2399
1850 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { 2400 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
1851 if (copy_from_user(&t, utime, sizeof(t)) != 0) 2401 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
1852 return -EFAULT; 2402 return -EFAULT;
1853 if (!timespec_valid(&t)) 2403 if (!timespec_valid(&ts))
1854 return -EINVAL; 2404 return -EINVAL;
1855 if (op == FUTEX_WAIT) 2405
1856 timeout = timespec_to_jiffies(&t) + 1; 2406 t = timespec_to_ktime(ts);
1857 else { 2407 if (cmd == FUTEX_WAIT)
1858 timeout = t.tv_sec; 2408 t = ktime_add(ktime_get(), t);
1859 val2 = t.tv_nsec; 2409 tp = &t;
1860 }
1861 } 2410 }
1862 /* 2411 /*
1863 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 2412 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
1864 */ 2413 */
1865 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) 2414 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE
2415 || cmd == FUTEX_CMP_REQUEUE_PI)
1866 val2 = (u32) (unsigned long) utime; 2416 val2 = (u32) (unsigned long) utime;
1867 2417
1868 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); 2418 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
1869} 2419}
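
The timeout plumbing moves from jiffies to ktime_t: do_futex() takes a ktime_t pointer and sys_futex() converts exactly once. The subtle point is that FUTEX_WAIT takes a relative timeout from userspace, so it is turned into an absolute expiry here, whereas FUTEX_LOCK_PI already passes an absolute time (previously smuggled through as t.tv_sec in timeout and t.tv_nsec in val2, as the removed lines show). Condensed:

	t = timespec_to_ktime(ts);		/* one representation for both */
	if (cmd == FUTEX_WAIT)			/* WAIT: relative timeout ...  */
		t = ktime_add(ktime_get(), t);	/* ... made absolute: now + ts */
	tp = &t;				/* LOCK_PI: absolute as given  */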
1870 2420
1871static int futexfs_get_sb(struct file_system_type *fs_type, 2421static int futexfs_get_sb(struct file_system_type *fs_type,
@@ -1895,7 +2445,7 @@ static int __init init(void)
1895 } 2445 }
1896 2446
1897 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2447 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
1898 INIT_LIST_HEAD(&futex_queues[i].chain); 2448 plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock);
1899 spin_lock_init(&futex_queues[i].lock); 2449 spin_lock_init(&futex_queues[i].lock);
1900 } 2450 }
1901 return 0; 2451 return 0;
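
plist_head_init() records the bucket spinlock purely for the list-debugging checks in lib/plist.c (CONFIG_DEBUG_PI_LIST is assumed to be the governing option); non-debug builds simply ignore it. Mirroring the loop above for a single bucket:

	struct futex_hash_bucket hb;

	plist_head_init(&hb.chain, &hb.lock);	/* lock used by debug checks only */
	spin_lock_init(&hb.lock);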