author		Pierre Peiffer <pierre.peiffer@bull.net>	2007-05-09 05:35:02 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-05-09 15:30:55 -0400
commit		d0aa7a70bf03b9de9e995ab272293be1f7937822 (patch)
tree		194b30b7b8374b946f166996cb99fb95eb3b7819
parent		c19384b5b296905d4988c7c684ff540a0f9d65be (diff)
futex_requeue_pi optimization
This patch provides the futex_requeue_pi functionality, which allows some
threads waiting on a normal futex to be requeued on the wait-queue of a
PI-futex.

This provides an optimization, already used for (normal) futexes, to be
used with the PI-futexes.

This optimization is currently used by glibc in pthread_cond_broadcast()
when using "normal" mutexes. With futex_requeue_pi, it can be used with
PRIO_INHERIT mutexes too.

Signed-off-by: Pierre Peiffer <pierre.peiffer@bull.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	include/linux/futex.h	  9
-rw-r--r--	kernel/futex.c		541
-rw-r--r--	kernel/futex_compat.c	  3
-rw-r--r--	kernel/rtmutex.c	 41
-rw-r--r--	kernel/rtmutex_common.h	 34
5 files changed, 540 insertions(+), 88 deletions(-)
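Illustration (not part of the patch): the new op is meant to back glibc's pthread_cond_broadcast() when the associated mutex is PRIO_INHERIT. A minimal userspace sketch of that pattern, assuming the op value 9 added by the include/linux/futex.h hunk below and calling futex(2) directly; the helper names and the bare futex words are illustrative, not glibc's real condvar layout:

	/*
	 * Illustrative sketch only. Wake one waiter of a plain futex and
	 * requeue the rest onto a PI-futex, instead of waking them all
	 * and letting them stampede on the mutex ("thundering herd").
	 */
	#include <stdint.h>
	#include <limits.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#define FUTEX_CMP_REQUEUE_PI_OP	9	/* FUTEX_CMP_REQUEUE_PI in this patch */

	static long sys_futex(uint32_t *uaddr, int op, uint32_t val,
			      unsigned long val2, uint32_t *uaddr2, uint32_t val3)
	{
		/* For requeue ops, the timeout slot carries nr_requeue. */
		return syscall(SYS_futex, uaddr, op, val, val2, uaddr2, val3);
	}

	/*
	 * Broadcast on 'cond' (a plain futex word): wake one waiter and
	 * requeue all others onto 'mutex' (a PI futex word holding the
	 * owner TID). 'seq' is the expected value of *cond, checked the
	 * same way as for FUTEX_CMP_REQUEUE.
	 */
	static void cond_broadcast_pi(uint32_t *cond, uint32_t *mutex, uint32_t seq)
	{
		sys_futex(cond, FUTEX_CMP_REQUEUE_PI_OP, 1 /* nr_wake */,
			  INT_MAX /* nr_requeue */, mutex, seq /* cmpval */);
	}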
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 34e54f2b8997..1bd8dfcb037b 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -17,6 +17,7 @@ union ktime;
 #define FUTEX_LOCK_PI		6
 #define FUTEX_UNLOCK_PI		7
 #define FUTEX_TRYLOCK_PI	8
+#define FUTEX_CMP_REQUEUE_PI	9
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
@@ -85,9 +86,14 @@ struct robust_list_head {
 #define FUTEX_OWNER_DIED	0x40000000
 
 /*
+ * Some processes have been requeued on this PI-futex
+ */
+#define FUTEX_WAITER_REQUEUED	0x20000000
+
+/*
  * The rest of the robust-futex field is for the TID:
  */
-#define FUTEX_TID_MASK		0x3fffffff
+#define FUTEX_TID_MASK		0x0fffffff
 
 /*
  * This limit protects against a deliberately circular list.
@@ -111,6 +117,7 @@ handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi);
  * We set bit 0 to indicate if it's an inode-based key.
  */
 union futex_key {
+	u32 __user *uaddr;
 	struct {
 		unsigned long pgoff;
 		struct inode *inode;
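Illustration (not part of the patch): the futex word now packs three high flag bits plus the owner TID; the TID mask shrinks from 0x3fffffff to 0x0fffffff so that bit 29 can carry FUTEX_WAITER_REQUEUED. A small decoder, assuming FUTEX_WAITERS' existing value of 0x80000000 from this header, makes the packing explicit:

	#include <stdint.h>
	#include <stdio.h>

	#define FUTEX_WAITERS		0x80000000u
	#define FUTEX_OWNER_DIED	0x40000000u
	#define FUTEX_WAITER_REQUEUED	0x20000000u	/* new in this patch */
	#define FUTEX_TID_MASK		0x0fffffffu	/* was 0x3fffffff */

	int main(void)
	{
		uint32_t val = FUTEX_WAITERS | FUTEX_WAITER_REQUEUED | 4242;

		/* Split the futex word into owner TID and flag bits. */
		printf("tid=%u waiters=%d owner_died=%d requeued=%d\n",
		       val & FUTEX_TID_MASK,
		       !!(val & FUTEX_WAITERS),
		       !!(val & FUTEX_OWNER_DIED),
		       !!(val & FUTEX_WAITER_REQUEUED));
		return 0;
	}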
diff --git a/kernel/futex.c b/kernel/futex.c
index e1246ccbf89a..4a60ef55dab4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -53,6 +53,12 @@
 
 #include "rtmutex_common.h"
 
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"
+#endif
+
 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
 
 /*
@@ -102,6 +108,12 @@ struct futex_q {
 	/* Optional priority inheritance state: */
 	struct futex_pi_state *pi_state;
 	struct task_struct *task;
+
+	/*
+	 * This waiter is used in case of requeue from a
+	 * normal futex to a PI-futex
+	 */
+	struct rt_mutex_waiter waiter;
 };
 
 /*
@@ -180,6 +192,9 @@ int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
 		return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
 
+	/* Save the user address in the key */
+	key->uaddr = uaddr;
+
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
@@ -439,7 +454,8 @@ void exit_pi_state_list(struct task_struct *curr)
 }
 
 static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+		union futex_key *key, struct futex_pi_state **ps)
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
@@ -450,7 +466,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	head = &hb->chain;
 
 	plist_for_each_entry_safe(this, next, head, list) {
-		if (match_futex(&this->key, &me->key)) {
+		if (match_futex(&this->key, key)) {
 			/*
 			 * Another waiter already exists - bump up
 			 * the refcount and return its pi_state:
@@ -465,7 +481,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 			WARN_ON(!atomic_read(&pi_state->refcount));
 
 			atomic_inc(&pi_state->refcount);
-			me->pi_state = pi_state;
+			*ps = pi_state;
 
 			return 0;
 		}
@@ -492,7 +508,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
 
 	/* Store the key for possible exit cleanups: */
-	pi_state->key = me->key;
+	pi_state->key = *key;
 
 	spin_lock_irq(&p->pi_lock);
 	WARN_ON(!list_empty(&pi_state->list));
@@ -502,7 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 
 	put_task_struct(p);
 
-	me->pi_state = pi_state;
+	*ps = pi_state;
 
 	return 0;
 }
@@ -562,6 +578,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
 		newval = FUTEX_WAITERS | new_owner->pid;
+		/* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
+		newval |= (uval & FUTEX_WAITER_REQUEUED);
 
 		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -666,6 +684,254 @@ out:
 }
 
 /*
+ * Called from futex_requeue_pi.
+ * Set the FUTEX_WAITERS and FUTEX_WAITER_REQUEUED flags on the
+ * PI-futex value; look up its associated pi_state if an owner exists,
+ * or create a new one without an owner.
+ */
+static inline int
+lookup_pi_state_for_requeue(u32 __user *uaddr, struct futex_hash_bucket *hb,
+			    union futex_key *key,
+			    struct futex_pi_state **pi_state)
+{
+	u32 curval, uval, newval;
+
+retry:
+	/*
+	 * We can't handle a fault cleanly because we can't
+	 * release the locks here. Simply return the fault.
+	 */
+	if (get_futex_value_locked(&curval, uaddr))
+		return -EFAULT;
+
+	/* set the flags FUTEX_WAITERS and FUTEX_WAITER_REQUEUED */
+	if ((curval & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
+	    != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
+		/*
+		 * No waiters yet, we prepare the futex to have some waiters.
+		 */
+
+		uval = curval;
+		newval = uval | FUTEX_WAITERS | FUTEX_WAITER_REQUEUED;
+
+		pagefault_disable();
+		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+		pagefault_enable();
+
+		if (unlikely(curval == -EFAULT))
+			return -EFAULT;
+		if (unlikely(curval != uval))
+			goto retry;
+	}
+
+	if (!(curval & FUTEX_TID_MASK)
+	    || lookup_pi_state(curval, hb, key, pi_state)) {
+		/* the futex has no owner (yet) or the lookup failed:
+		   allocate one pi_state without owner */
+
+		*pi_state = alloc_pi_state();
+
+		/* Already stores the key: */
+		(*pi_state)->key = *key;
+
+		/* init the mutex without owner */
+		__rt_mutex_init(&(*pi_state)->pi_mutex, NULL);
+	}
+
+	return 0;
+}
+
+/*
+ * Wake up the first nr_wake waiters of futex1 (uaddr1), and requeue
+ * the following nr_requeue waiters, hashed on one physical page, onto
+ * another physical page (the PI-futex uaddr2).
+ */
+static int futex_requeue_pi(u32 __user *uaddr1, u32 __user *uaddr2,
+			    int nr_wake, int nr_requeue, u32 *cmpval)
+{
+	union futex_key key1, key2;
+	struct futex_hash_bucket *hb1, *hb2;
+	struct plist_head *head1;
+	struct futex_q *this, *next;
+	struct futex_pi_state *pi_state2 = NULL;
+	struct rt_mutex_waiter *waiter, *top_waiter = NULL;
+	struct rt_mutex *lock2 = NULL;
+	int ret, drop_count = 0;
+
+	if (refill_pi_state_cache())
+		return -ENOMEM;
+
+retry:
+	/*
+	 * First take all the futex related locks:
+	 */
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr1, &key1);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
+
+	double_lock_hb(hb1, hb2);
+
+	if (likely(cmpval != NULL)) {
+		u32 curval;
+
+		ret = get_futex_value_locked(&curval, uaddr1);
+
+		if (unlikely(ret)) {
+			spin_unlock(&hb1->lock);
+			if (hb1 != hb2)
+				spin_unlock(&hb2->lock);
+
+			/*
+			 * If we would have faulted, release mmap_sem, fault
+			 * it in and start all over again.
+			 */
+			up_read(&current->mm->mmap_sem);
+
+			ret = get_user(curval, uaddr1);
+
+			if (!ret)
+				goto retry;
+
+			return ret;
+		}
+		if (curval != *cmpval) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+	}
+
+	head1 = &hb1->chain;
+	plist_for_each_entry_safe(this, next, head1, list) {
+		if (!match_futex(&this->key, &key1))
+			continue;
+		if (++ret <= nr_wake) {
+			wake_futex(this);
+		} else {
+			/*
+			 * FIRST: get and set the pi_state
+			 */
+			if (!pi_state2) {
+				int s;
+				/* do this only the first time we requeue someone */
+				s = lookup_pi_state_for_requeue(uaddr2, hb2,
+								&key2, &pi_state2);
+				if (s) {
+					ret = s;
+					goto out_unlock;
+				}
+
+				lock2 = &pi_state2->pi_mutex;
+				spin_lock(&lock2->wait_lock);
+
+				/* Save the top waiter of the wait_list */
+				if (rt_mutex_has_waiters(lock2))
+					top_waiter = rt_mutex_top_waiter(lock2);
+			} else
+				atomic_inc(&pi_state2->refcount);
+
+			this->pi_state = pi_state2;
+
+			/*
+			 * SECOND: requeue futex_q to the correct hashbucket
+			 */
+
+			/*
+			 * If key1 and key2 hash to the same bucket, no need to
+			 * requeue.
+			 */
+			if (likely(head1 != &hb2->chain)) {
+				plist_del(&this->list, &hb1->chain);
+				plist_add(&this->list, &hb2->chain);
+				this->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+				this->list.plist.lock = &hb2->lock;
+#endif
+			}
+			this->key = key2;
+			get_futex_key_refs(&key2);
+			drop_count++;
+
+			/*
+			 * THIRD: queue it to lock2
+			 */
+			spin_lock_irq(&this->task->pi_lock);
+			waiter = &this->waiter;
+			waiter->task = this->task;
+			waiter->lock = lock2;
+			plist_node_init(&waiter->list_entry, this->task->prio);
+			plist_node_init(&waiter->pi_list_entry, this->task->prio);
+			plist_add(&waiter->list_entry, &lock2->wait_list);
+			this->task->pi_blocked_on = waiter;
+			spin_unlock_irq(&this->task->pi_lock);
+
+			if (ret - nr_wake >= nr_requeue)
+				break;
+		}
+	}
+
+	/*
+	 * If we've requeued some tasks and the top_waiter of the rt_mutex
+	 * has changed, we must adjust the priority of the owner, if any.
+	 */
+	if (drop_count) {
+		struct task_struct *owner = rt_mutex_owner(lock2);
+		if (owner &&
+		    (top_waiter != (waiter = rt_mutex_top_waiter(lock2)))) {
+			int chain_walk = 0;
+
+			spin_lock_irq(&owner->pi_lock);
+			if (top_waiter)
+				plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
+			else
+				/*
+				 * There were no waiters before the requeue;
+				 * the flag must be updated.
+				 */
+				mark_rt_mutex_waiters(lock2);
+
+			plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
+			__rt_mutex_adjust_prio(owner);
+			if (owner->pi_blocked_on) {
+				chain_walk = 1;
+				get_task_struct(owner);
+			}
+
+			spin_unlock_irq(&owner->pi_lock);
+			spin_unlock(&lock2->wait_lock);
+
+			if (chain_walk)
+				rt_mutex_adjust_prio_chain(owner, 0, lock2, NULL,
+							   current);
+		} else {
+			/* No owner or the top_waiter does not change */
+			mark_rt_mutex_waiters(lock2);
+			spin_unlock(&lock2->wait_lock);
+		}
+	}
+
+out_unlock:
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
+
+	/* drop_futex_key_refs() must be called outside the spinlocks. */
+	while (--drop_count >= 0)
+		drop_futex_key_refs(&key1);
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
@@ -984,9 +1250,10 @@ static int unqueue_me(struct futex_q *q)
 
 /*
  * PI futexes can not be requeued and must remove themself from the
- * hash bucket. The hash bucket lock is held on entry and dropped here.
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here.
  */
-static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
+static void unqueue_me_pi(struct futex_q *q)
 {
 	WARN_ON(plist_node_empty(&q->list));
 	plist_del(&q->list, &q->list.plist);
@@ -995,11 +1262,65 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
 	free_pi_state(q->pi_state);
 	q->pi_state = NULL;
 
-	spin_unlock(&hb->lock);
+	spin_unlock(q->lock_ptr);
 
 	drop_futex_key_refs(&q->key);
 }
 
+/*
+ * Fixup the pi_state owner with current.
+ *
+ * The curr->mm semaphore must be held; it is released on return from
+ * this function.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+				struct futex_hash_bucket *hb,
+				struct task_struct *curr)
+{
+	u32 newtid = curr->pid | FUTEX_WAITERS;
+	struct futex_pi_state *pi_state = q->pi_state;
+	u32 uval, curval, newval;
+	int ret;
+
+	/* Owner died? */
+	if (pi_state->owner != NULL) {
+		spin_lock_irq(&pi_state->owner->pi_lock);
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
+		spin_unlock_irq(&pi_state->owner->pi_lock);
+	} else
+		newtid |= FUTEX_OWNER_DIED;
+
+	pi_state->owner = curr;
+
+	spin_lock_irq(&curr->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
+	list_add(&pi_state->list, &curr->pi_state_list);
+	spin_unlock_irq(&curr->pi_lock);
+
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(q);
+	up_read(&curr->mm->mmap_sem);
+	/*
+	 * We own it, so we have to replace the pending owner
+	 * TID. This must be atomic as we have to preserve the
+	 * owner died bit here.
+	 */
+	ret = get_user(uval, uaddr);
+	while (!ret) {
+		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+		newval |= (uval & FUTEX_WAITER_REQUEUED);
+		curval = futex_atomic_cmpxchg_inatomic(uaddr,
+						       uval, newval);
+		if (curval == -EFAULT)
+			ret = -EFAULT;
+		if (curval == uval)
+			break;
+		uval = curval;
+	}
+	return ret;
+}
+
 static long futex_wait_restart(struct restart_block *restart);
 static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 {
@@ -1009,7 +1330,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	struct futex_q q;
 	u32 uval;
 	int ret;
-	struct hrtimer_sleeper t;
+	struct hrtimer_sleeper t, *to = NULL;
 	int rem = 0;
 
 	q.pi_state = NULL;
@@ -1063,6 +1384,14 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	if (uval != val)
 		goto out_unlock_release_sem;
 
+	/*
+	 * This rt_mutex_waiter structure is prepared here and will
+	 * be used only if this task is requeued from a normal futex to
+	 * a PI-futex with futex_requeue_pi.
+	 */
+	debug_rt_mutex_init_waiter(&q.waiter);
+	q.waiter.task = NULL;
+
 	/* Only actually queue if *uaddr contained val. */
 	__queue_me(&q, hb);
 
@@ -1092,6 +1421,7 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	if (!abs_time)
 		schedule();
 	else {
+		to = &t;
 		hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 		hrtimer_init_sleeper(&t, current);
 		t.timer.expires = *abs_time;
@@ -1119,6 +1449,66 @@ static int futex_wait(u32 __user *uaddr, u32 val, ktime_t *abs_time)
 	 * we are the only user of it.
 	 */
 
+	if (q.pi_state) {
+		/*
+		 * We were woken but have been requeued on a PI-futex.
+		 * We have to complete the lock acquisition by taking
+		 * the rtmutex.
+		 */
+
+		struct rt_mutex *lock = &q.pi_state->pi_mutex;
+
+		spin_lock(&lock->wait_lock);
+		if (unlikely(q.waiter.task)) {
+			remove_waiter(lock, &q.waiter);
+		}
+		spin_unlock(&lock->wait_lock);
+
+		if (rem)
+			ret = -ETIMEDOUT;
+		else
+			ret = rt_mutex_timed_lock(lock, to, 1);
+
+		down_read(&curr->mm->mmap_sem);
+		spin_lock(q.lock_ptr);
+
+		/*
+		 * Got the lock. We might not be the anticipated owner if we
+		 * did a lock-steal - fix up the PI-state in that case.
+		 */
+		if (!ret && q.pi_state->owner != curr) {
+			/*
+			 * We MUST play with the futex we were requeued on,
+			 * NOT the current futex.
+			 * We can retrieve it from the key of the pi_state.
+			 */
+			uaddr = q.pi_state->key.uaddr;
+
+			/* mmap_sem and hash_bucket lock are unlocked at
+			   return of this function */
+			ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+		} else {
+			/*
+			 * Catch the rare case where the lock was released
+			 * when we were on the way back before we locked
+			 * the hash bucket.
+			 */
+			if (ret && q.pi_state->owner == curr) {
+				if (rt_mutex_trylock(&q.pi_state->pi_mutex))
+					ret = 0;
+			}
+			/* Unqueue and drop the lock */
+			unqueue_me_pi(&q);
+			up_read(&curr->mm->mmap_sem);
+		}
+
+		debug_rt_mutex_free_waiter(&q.waiter);
+
+		return ret;
+	}
+
+	debug_rt_mutex_free_waiter(&q.waiter);
+
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	if (!unqueue_me(&q))
 		return 0;
@@ -1161,6 +1551,51 @@ static long futex_wait_restart(struct restart_block *restart)
 }
 
 
+static void set_pi_futex_owner(struct futex_hash_bucket *hb,
+			       union futex_key *key, struct task_struct *p)
+{
+	struct plist_head *head;
+	struct futex_q *this, *next;
+	struct futex_pi_state *pi_state = NULL;
+	struct rt_mutex *lock;
+
+	/* Search for a waiter that should already exist */
+
+	head = &hb->chain;
+
+	plist_for_each_entry_safe(this, next, head, list) {
+		if (match_futex(&this->key, key)) {
+			pi_state = this->pi_state;
+			break;
+		}
+	}
+
+	BUG_ON(!pi_state);
+
+	/* set p as pi_state's owner */
+	lock = &pi_state->pi_mutex;
+
+	spin_lock(&lock->wait_lock);
+	spin_lock_irq(&p->pi_lock);
+
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+
+	/* set p as pi_mutex's owner */
+	debug_rt_mutex_proxy_lock(lock, p);
+	WARN_ON(rt_mutex_owner(lock));
+	rt_mutex_set_owner(lock, p, 0);
+	rt_mutex_deadlock_account_lock(lock, p);
+
+	plist_add(&rt_mutex_top_waiter(lock)->pi_list_entry,
+		  &p->pi_waiters);
+	__rt_mutex_adjust_prio(p);
+
+	spin_unlock_irq(&p->pi_lock);
+	spin_unlock(&lock->wait_lock);
+}
+
 /*
  * Userspace tried a 0 -> TID atomic transition of the futex value
  * and failed. The kernel side here does the whole locking operation:
@@ -1175,7 +1610,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	struct futex_hash_bucket *hb;
 	u32 uval, newval, curval;
 	struct futex_q q;
-	int ret, attempt = 0;
+	int ret, lock_held, attempt = 0;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1198,6 +1633,8 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	hb = queue_lock(&q, -1, NULL);
 
  retry_locked:
+	lock_held = 0;
+
 	/*
 	 * To avoid races, we attempt to take the lock here again
 	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
@@ -1216,7 +1653,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
 		if (!detect && 0)
 			force_sig(SIGKILL, current);
-		ret = -EDEADLK;
+		/*
+		 * Normally, this check is done in user space.
+		 * In case of requeue, the owner may attempt to lock this
+		 * futex, even if the ownership has already been given by
+		 * the previous waker.
+		 * In the usual case this is a deadlock, but not in case
+		 * of REQUEUE_PI.
+		 */
+		if (!(curval & FUTEX_WAITER_REQUEUED))
+			ret = -EDEADLK;
 		goto out_unlock_release_sem;
 	}
 
@@ -1228,7 +1674,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 		goto out_unlock_release_sem;
 
 	uval = curval;
-	newval = uval | FUTEX_WAITERS;
+	/*
+	 * In case of a requeue, check if there already is an owner.
+	 * If not, just take the futex.
+	 */
+	if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
+		/* set current as futex owner */
+		newval = curval | current->pid;
+		lock_held = 1;
+	} else
+		/* Set the WAITERS flag, so the owner will know it has someone
+		   to wake at next unlock */
+		newval = curval | FUTEX_WAITERS;
 
 	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1239,11 +1696,16 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	if (unlikely(curval != uval))
 		goto retry_locked;
 
+	if (lock_held) {
+		set_pi_futex_owner(hb, &q.key, curr);
+		goto out_unlock_release_sem;
+	}
+
 	/*
 	 * We dont have the lock. Look up the PI state (or create it if
 	 * we are the first waiter):
 	 */
-	ret = lookup_pi_state(uval, hb, &q);
+	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
 
 	if (unlikely(ret)) {
 		/*
@@ -1306,45 +1768,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 	 * Got the lock. We might not be the anticipated owner if we
 	 * did a lock-steal - fix up the PI-state in that case.
 	 */
-	if (!ret && q.pi_state->owner != curr) {
-		u32 newtid = current->pid | FUTEX_WAITERS;
-
-		/* Owner died? */
-		if (q.pi_state->owner != NULL) {
-			spin_lock_irq(&q.pi_state->owner->pi_lock);
-			WARN_ON(list_empty(&q.pi_state->list));
-			list_del_init(&q.pi_state->list);
-			spin_unlock_irq(&q.pi_state->owner->pi_lock);
-		} else
-			newtid |= FUTEX_OWNER_DIED;
-
-		q.pi_state->owner = current;
-
-		spin_lock_irq(&current->pi_lock);
-		WARN_ON(!list_empty(&q.pi_state->list));
-		list_add(&q.pi_state->list, &current->pi_state_list);
-		spin_unlock_irq(&current->pi_lock);
-
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
-		up_read(&curr->mm->mmap_sem);
-		/*
-		 * We own it, so we have to replace the pending owner
-		 * TID. This must be atomic as we have preserve the
-		 * owner died bit here.
-		 */
-		ret = get_user(uval, uaddr);
-		while (!ret) {
-			newval = (uval & FUTEX_OWNER_DIED) | newtid;
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			if (curval == -EFAULT)
-				ret = -EFAULT;
-			if (curval == uval)
-				break;
-			uval = curval;
-		}
-	} else {
+	if (!ret && q.pi_state->owner != curr)
+		/* mmap_sem is unlocked at return of this function */
+		ret = fixup_pi_state_owner(uaddr, &q, hb, curr);
+	else {
 		/*
 		 * Catch the rare case, where the lock was released
 		 * when we were on the way back before we locked
@@ -1355,7 +1782,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, ktime_t *time,
 			ret = 0;
 		}
 		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q, hb);
+		unqueue_me_pi(&q);
 		up_read(&curr->mm->mmap_sem);
 	}
 
@@ -1724,6 +2151,8 @@ retry:
 	 * userspace.
 	 */
 	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+	/* Also keep the FUTEX_WAITER_REQUEUED flag if set */
+	mval |= (uval & FUTEX_WAITER_REQUEUED);
 	nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
 
 	if (nval == -EFAULT)
@@ -1854,6 +2283,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 	case FUTEX_TRYLOCK_PI:
 		ret = futex_lock_pi(uaddr, 0, timeout, 1);
 		break;
+	case FUTEX_CMP_REQUEUE_PI:
+		ret = futex_requeue_pi(uaddr, uaddr2, val, val2, &val3);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -1883,7 +2315,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+	    || op == FUTEX_CMP_REQUEUE_PI)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
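Illustration (not part of the patch): the flag-setting step in lookup_pi_state_for_requeue() above is a compare-and-swap retry loop. A userspace analogue of the same shape, sketched with GCC/Clang __atomic builtins in place of futex_atomic_cmpxchg_inatomic() (the function name here is invented):

	#include <stdint.h>

	#define FUTEX_WAITERS		0x80000000u
	#define FUTEX_WAITER_REQUEUED	0x20000000u

	/* OR the two flag bits into a shared futex word, retrying if
	 * another thread changed the word in the meantime. */
	static void set_waiter_flags(uint32_t *word)
	{
		uint32_t cur = __atomic_load_n(word, __ATOMIC_RELAXED);

		while ((cur & (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED))
		       != (FUTEX_WAITERS | FUTEX_WAITER_REQUEUED)) {
			uint32_t newval = cur | FUTEX_WAITERS
					      | FUTEX_WAITER_REQUEUED;

			/* On failure, 'cur' is refreshed with the value
			 * currently in memory and the loop retries. */
			if (__atomic_compare_exchange_n(word, &cur, newval,
							0 /* strong */,
							__ATOMIC_ACQ_REL,
							__ATOMIC_RELAXED))
				break;
		}
	}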
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index dff27c471ea6..338a9b489fbc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -156,7 +156,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 		t = ktime_add(ktime_get(), t);
 		tp = &t;
 	}
-	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE
+	    || op == FUTEX_CMP_REQUEUE_PI)
 		val2 = (int) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 180978cb2f75..12879f6c1ec3 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -56,7 +56,7 @@
  * state.
  */
 
-static void
+void
 rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
 		   unsigned long mask)
 {
@@ -81,29 +81,6 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 }
 
 /*
- * We can speed up the acquire/release, if the architecture
- * supports cmpxchg and if there's no debugging state to be set up
- */
-#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
-# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
-	unsigned long owner, *p = (unsigned long *) &lock->owner;
-
-	do {
-		owner = *p;
-	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
-}
-#else
-# define rt_mutex_cmpxchg(l,c,n)	(0)
-static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
-{
-	lock->owner = (struct task_struct *)
-			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
-}
-#endif
-
-/*
  * Calculate task priority from the waiter list priority
  *
  * Return task->normal_prio when the waiter list is empty or when
@@ -123,7 +100,7 @@ int rt_mutex_getprio(struct task_struct *task)
  *
  * This can be both boosting and unboosting. task->pi_lock must be held.
  */
-static void __rt_mutex_adjust_prio(struct task_struct *task)
+void __rt_mutex_adjust_prio(struct task_struct *task)
 {
 	int prio = rt_mutex_getprio(task);
 
@@ -159,11 +136,11 @@ int max_lock_depth = 1024;
 * Decreases task's usage by one - may thus free the task.
 * Returns 0 or -EDEADLK.
 */
-static int rt_mutex_adjust_prio_chain(struct task_struct *task,
-				      int deadlock_detect,
-				      struct rt_mutex *orig_lock,
-				      struct rt_mutex_waiter *orig_waiter,
-				      struct task_struct *top_task)
+int rt_mutex_adjust_prio_chain(struct task_struct *task,
+			       int deadlock_detect,
+			       struct rt_mutex *orig_lock,
+			       struct rt_mutex_waiter *orig_waiter,
+			       struct task_struct *top_task)
 {
 	struct rt_mutex *lock;
 	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
@@ -524,8 +501,8 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
 *
 * Must be called with lock->wait_lock held
 */
-static void remove_waiter(struct rt_mutex *lock,
-			  struct rt_mutex_waiter *waiter)
+void remove_waiter(struct rt_mutex *lock,
+		   struct rt_mutex_waiter *waiter)
 {
 	int first = (waiter == rt_mutex_top_waiter(lock));
 	struct task_struct *owner = rt_mutex_owner(lock);
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 9c75856e791e..242ec7ee740b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -113,6 +113,29 @@ static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
 }
 
 /*
+ * We can speed up the acquire/release, if the architecture
+ * supports cmpxchg and if there's no debugging state to be set up
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
+# define rt_mutex_cmpxchg(l,c,n)	(cmpxchg(&l->owner, c, n) == c)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	unsigned long owner, *p = (unsigned long *) &lock->owner;
+
+	do {
+		owner = *p;
+	} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
+}
+#else
+# define rt_mutex_cmpxchg(l,c,n)	(0)
+static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
+{
+	lock->owner = (struct task_struct *)
+			((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
+}
+#endif
+
+/*
  * PI-futex support (proxy locking functions, etc.):
  */
 extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -120,4 +143,15 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				       struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 				  struct task_struct *proxy_owner);
+
+extern void rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
+			       unsigned long mask);
+extern void __rt_mutex_adjust_prio(struct task_struct *task);
+extern int rt_mutex_adjust_prio_chain(struct task_struct *task,
+				      int deadlock_detect,
+				      struct rt_mutex *orig_lock,
+				      struct rt_mutex_waiter *orig_waiter,
+				      struct task_struct *top_task);
+extern void remove_waiter(struct rt_mutex *lock,
+			  struct rt_mutex_waiter *waiter);
 #endif
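Illustration (not part of the patch): mark_rt_mutex_waiters(), now shared through this header, ORs RT_MUTEX_HAS_WAITERS into the low bits of the owner pointer with a cmpxchg loop; the pointer's alignment keeps those bits free. A userspace sketch of the same tagged-pointer trick in C11 atomics, with invented names and the tag value assumed to mirror RT_MUTEX_HAS_WAITERS:

	#include <stdatomic.h>
	#include <stdint.h>

	#define HAS_WAITERS	1UL	/* low tag bit, free because owners are word-aligned */

	struct fake_mutex {
		_Atomic uintptr_t owner;	/* owner pointer | tag bits */
	};

	static void mark_waiters(struct fake_mutex *lock)
	{
		uintptr_t old = atomic_load_explicit(&lock->owner,
						     memory_order_relaxed);

		/* Retry until the tag bit sticks, preserving whatever
		 * owner pointer is currently stored alongside it. */
		while (!atomic_compare_exchange_weak_explicit(&lock->owner,
							      &old,
							      old | HAS_WAITERS,
							      memory_order_acq_rel,
							      memory_order_relaxed))
			;	/* 'old' was refreshed; just retry */
	}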