author	Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>	2007-06-08 16:47:00 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-06-08 20:23:34 -0400
commit	778e9a9c3e7193ea9f434f382947155ffb59c755 (patch)
tree	2ceb8c7ce1d55124982b77966dcd65cee5cc623b /kernel
parent	1a539a87280b3032fd12bc93a4a82f1d8aa97ca8 (diff)
pi-futex: fix exit races and locking problems
1. New entries can be added to tsk->pi_state_list after task completed
   exit_pi_state_list(). The result is memory leakage and deadlocks.

2. handle_mm_fault() is called under spinlock. The result is obvious.

3. results in self-inflicted deadlock inside glibc. Sometimes futex_lock_pi
   returns -ESRCH, when it is not expected and glibc enters to for(;;) sleep()
   to simulate deadlock. This problem is quite obvious and I think the patch
   is right. Though it looks like each "if" in futex_lock_pi() got some stupid
   special case "else if". :-)

4. sometimes futex_lock_pi() returns -EDEADLK, when nobody has the lock.
   The reason is also obvious (see comment in the patch), but correct fix
   is far beyond my comprehension. I guess someone already saw this, the
   chunk:

        if (rt_mutex_trylock(&q.pi_state->pi_mutex))
                ret = 0;

   is obviously from the same opera. But it does not work, because the
   rtmutex is really taken at this point: wake_futex_pi() of previous owner
   reassigned it to us. My fix works. But it looks very stupid.
   I would think about removal of shift of ownership in wake_futex_pi()
   and making all the work in context of process taking lock.

From: Thomas Gleixner <tglx@linutronix.de>

Fix 1) Avoid the tasklist lock variant of the exit race fix by adding
    an additional state transition to the exit code.

    This fixes also the issue, when a task with recursive segfaults
    is not able to release the futexes.

Fix 2) Cleanup the lookup_pi_state() failure path and solve the -ESRCH
    problem finally.

Fix 3) Solve the fixup_pi_state_owner() problem which needs to do the fixup
    in the lock protected section by using the in_atomic userspace access
    functions.

    This removes also the ugly lock drop / unqueue inside of fixup_pi_state()

Fix 4) Fix a stale lock in the error path of futex_wake_pi()

Added some error checks for verification.

The -EDEADLK problem is solved by the rtmutex fixups.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
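The PF_EXITING / PF_EXITPIDONE handshake that Fix 1 introduces can be modelled in a few lines of plain C. The sketch below is illustrative only and runs in userspace; the names task_model, do_exit_model and lookup_owner are invented for this example and are not kernel APIs. A lookup that races with the exiting owner gets -EAGAIN and retries until the exiting side has finished its pi-state cleanup, after which it gets -ESRCH.

/* Hypothetical userspace model of the exit-vs-lookup handshake; not kernel code. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define MODEL_EXITING	 0x1	/* models PF_EXITING */
#define MODEL_EXITPIDONE 0x2	/* models PF_EXITPIDONE */

struct task_model {			/* stand-in for task_struct */
	pthread_mutex_t pi_lock;	/* stand-in for tsk->pi_lock */
	unsigned int flags;
};

/* Exit path: mark EXITING under pi_lock, clean up, then mark EXITPIDONE. */
static void *do_exit_model(void *arg)
{
	struct task_model *t = arg;

	pthread_mutex_lock(&t->pi_lock);
	t->flags |= MODEL_EXITING;
	pthread_mutex_unlock(&t->pi_lock);

	/* ... the exit_pi_state_list() equivalent would run here ... */

	/* set unlocked, mirroring the "we can do this unlocked" comment in the patch */
	t->flags |= MODEL_EXITPIDONE;
	return NULL;
}

/*
 * Lookup path, like the new check in lookup_pi_state(): -EAGAIN means
 * "owner is exiting, drop your locks and retry", -ESRCH means the owner
 * is gone and has finished its pi-state cleanup.
 */
static int lookup_owner(struct task_model *t)
{
	int ret = 0;

	pthread_mutex_lock(&t->pi_lock);
	if (t->flags & MODEL_EXITING)
		ret = (t->flags & MODEL_EXITPIDONE) ? -ESRCH : -EAGAIN;
	pthread_mutex_unlock(&t->pi_lock);
	return ret;
}

int main(void)
{
	struct task_model t = { PTHREAD_MUTEX_INITIALIZER, 0 };
	pthread_t th;
	int ret;

	pthread_create(&th, NULL, do_exit_model, &t);
	while ((ret = lookup_owner(&t)) == -EAGAIN)
		;	/* futex_lock_pi() drops its locks and cond_resched()s here */
	pthread_join(th, NULL);
	printf("lookup result after exit: %d\n", lookup_owner(&t));
	return 0;
}

In the real patch the retry path is the new -EAGAIN case in futex_lock_pi(), which drops the hash bucket lock and mmap_sem before looping back to the retry label.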
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/exit.c	24
-rw-r--r--	kernel/futex.c	269
2 files changed, 182 insertions, 111 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b888c24e43e..5c8ecbaa19a5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -892,13 +892,29 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (unlikely(tsk->flags & PF_EXITING)) {
 		printk(KERN_ALERT
 			"Fixing recursive fault but reboot is needed!\n");
+		/*
+		 * We can do this unlocked here. The futex code uses
+		 * this flag just to verify whether the pi state
+		 * cleanup has been done or not. In the worst case it
+		 * loops once more. We pretend that the cleanup was
+		 * done as there is no way to return. Either the
+		 * OWNER_DIED bit is set by now or we push the blocked
+		 * task into the wait for ever nirwana as well.
+		 */
+		tsk->flags |= PF_EXITPIDONE;
 		if (tsk->io_context)
 			exit_io_context();
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule();
 	}
 
+	/*
+	 * tsk->flags are checked in the futex code to protect against
+	 * an exiting task cleaning up the robust pi futexes.
+	 */
+	spin_lock_irq(&tsk->pi_lock);
 	tsk->flags |= PF_EXITING;
+	spin_unlock_irq(&tsk->pi_lock);
 
 	if (unlikely(in_atomic()))
 		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -912,7 +928,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
-	 	hrtimer_cancel(&tsk->signal->real_timer);
+		hrtimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 	}
 	acct_collect(code, group_dead);
@@ -965,6 +981,12 @@ fastcall NORET_TYPE void do_exit(long code)
 	 * Make sure we are holding no locks:
 	 */
 	debug_check_no_locks_held(tsk);
+	/*
+	 * We can do this unlocked here. The futex code uses this flag
+	 * just to verify whether the pi state cleanup has been done
+	 * or not. In the worst case it loops once more.
+	 */
+	tsk->flags |= PF_EXITPIDONE;
 
 	if (tsk->io_context)
 		exit_io_context();
diff --git a/kernel/futex.c b/kernel/futex.c
index b7ce15c67e32..3b7f7713d9a4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -430,10 +430,6 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 		p = NULL;
 		goto out_unlock;
 	}
-	if (p->exit_state != 0) {
-		p = NULL;
-		goto out_unlock;
-	}
 	get_task_struct(p);
 out_unlock:
 	rcu_read_unlock();
@@ -502,7 +498,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	struct futex_q *this, *next;
 	struct plist_head *head;
 	struct task_struct *p;
-	pid_t pid;
+	pid_t pid = uval & FUTEX_TID_MASK;
 
 	head = &hb->chain;
 
@@ -520,6 +516,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 				return -EINVAL;
 
 			WARN_ON(!atomic_read(&pi_state->refcount));
+			WARN_ON(pid && pi_state->owner &&
+				pi_state->owner->pid != pid);
 
 			atomic_inc(&pi_state->refcount);
 			*ps = pi_state;
@@ -530,15 +528,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 
 	/*
 	 * We are the first waiter - try to look up the real owner and attach
-	 * the new pi_state to it, but bail out when the owner died bit is set
-	 * and TID = 0:
+	 * the new pi_state to it, but bail out when TID = 0
 	 */
-	pid = uval & FUTEX_TID_MASK;
-	if (!pid && (uval & FUTEX_OWNER_DIED))
+	if (!pid)
 		return -ESRCH;
 	p = futex_find_get_task(pid);
-	if (!p)
-		return -ESRCH;
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	/*
+	 * We need to look at the task state flags to figure out,
+	 * whether the task is exiting. To protect against the do_exit
+	 * change of the task flags, we do this protected by
+	 * p->pi_lock:
+	 */
+	spin_lock_irq(&p->pi_lock);
+	if (unlikely(p->flags & PF_EXITING)) {
+		/*
+		 * The task is on the way out. When PF_EXITPIDONE is
+		 * set, we know that the task has finished the
+		 * cleanup:
+		 */
+		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+		spin_unlock_irq(&p->pi_lock);
+		put_task_struct(p);
+		return ret;
+	}
 
 	pi_state = alloc_pi_state();
 
@@ -551,7 +567,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	/* Store the key for possible exit cleanups: */
 	pi_state->key = *key;
 
-	spin_lock_irq(&p->pi_lock);
 	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
 	pi_state->owner = p;
@@ -618,6 +633,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 * preserve the owner died bit.)
 	 */
 	if (!(uval & FUTEX_OWNER_DIED)) {
+		int ret = 0;
+
 		newval = FUTEX_WAITERS | new_owner->pid;
 		/* Keep the FUTEX_WAITER_REQUEUED flag if it was set */
 		newval |= (uval & FUTEX_WAITER_REQUEUED);
@@ -625,10 +642,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
 		pagefault_enable();
+
 		if (curval == -EFAULT)
-			return -EFAULT;
+			ret = -EFAULT;
 		if (curval != uval)
-			return -EINVAL;
+			ret = -EINVAL;
+		if (ret) {
+			spin_unlock(&pi_state->pi_mutex.wait_lock);
+			return ret;
+		}
 	}
 
 	spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1174,7 +1196,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
 #ifdef CONFIG_DEBUG_PI_LIST
 			this->list.plist.lock = &hb2->lock;
 #endif
- 		}
+		}
 		this->key = key2;
 		get_futex_key_refs(&key2);
 		drop_count++;
@@ -1326,12 +1348,10 @@ static void unqueue_me_pi(struct futex_q *q)
 /*
  * Fixup the pi_state owner with current.
  *
- * The cur->mm semaphore must be held, it is released at return of this
- * function.
+ * Must be called with hash bucket lock held and mm->sem held for non
+ * private futexes.
  */
-static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
-				struct futex_q *q,
-				struct futex_hash_bucket *hb,
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 				struct task_struct *curr)
 {
 	u32 newtid = curr->pid | FUTEX_WAITERS;
@@ -1355,23 +1375,24 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared,
 	list_add(&pi_state->list, &curr->pi_state_list);
 	spin_unlock_irq(&curr->pi_lock);
 
-	/* Unqueue and drop the lock */
-	unqueue_me_pi(q);
-	if (fshared)
-		up_read(fshared);
 	/*
 	 * We own it, so we have to replace the pending owner
 	 * TID. This must be atomic as we have preserve the
 	 * owner died bit here.
 	 */
-	ret = get_user(uval, uaddr);
+	ret = get_futex_value_locked(&uval, uaddr);
+
 	while (!ret) {
 		newval = (uval & FUTEX_OWNER_DIED) | newtid;
 		newval |= (uval & FUTEX_WAITER_REQUEUED);
+
+		pagefault_disable();
 		curval = futex_atomic_cmpxchg_inatomic(uaddr,
 						       uval, newval);
+		pagefault_enable();
+
 		if (curval == -EFAULT)
 			ret = -EFAULT;
 		if (curval == uval)
 			break;
 		uval = curval;
@@ -1553,10 +1574,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 			 */
 			uaddr = q.pi_state->key.uaddr;
 
-			/* mmap_sem and hash_bucket lock are unlocked at
-			   return of this function */
-			ret = fixup_pi_state_owner(uaddr, fshared,
-						   &q, hb, curr);
+			ret = fixup_pi_state_owner(uaddr, &q, curr);
 		} else {
 			/*
 			 * Catch the rare case, where the lock was released
@@ -1567,12 +1585,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
 				ret = 0;
 		}
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q);
-		if (fshared)
-			up_read(fshared);
 	}
 
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(&q);
+	if (fshared)
+		up_read(fshared);
+
 	debug_rt_mutex_free_waiter(&q.waiter);
 
 	return ret;
@@ -1688,7 +1707,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	struct futex_hash_bucket *hb;
 	u32 uval, newval, curval;
 	struct futex_q q;
-	int ret, lock_held, attempt = 0;
+	int ret, lock_taken, ownerdied = 0, attempt = 0;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1709,10 +1728,11 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
+ retry_unlocked:
 	hb = queue_lock(&q, -1, NULL);
 
  retry_locked:
-	lock_held = 0;
+	ret = lock_taken = 0;
 
 	/*
 	 * To avoid races, we attempt to take the lock here again
@@ -1728,43 +1748,44 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (unlikely(curval == -EFAULT))
 		goto uaddr_faulted;
 
-	/* We own the lock already */
+	/*
+	 * Detect deadlocks. In case of REQUEUE_PI this is a valid
+	 * situation and we return success to user space.
+	 */
 	if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
-		if (!detect && 0)
-			force_sig(SIGKILL, current);
-		/*
-		 * Normally, this check is done in user space.
-		 * In case of requeue, the owner may attempt to lock this futex,
-		 * even if the ownership has already been given by the previous
-		 * waker.
-		 * In the usual case, this is a case of deadlock, but not in case
-		 * of REQUEUE_PI.
-		 */
 		if (!(curval & FUTEX_WAITER_REQUEUED))
 			ret = -EDEADLK;
 		goto out_unlock_release_sem;
 	}
 
 	/*
-	 * Surprise - we got the lock. Just return
-	 * to userspace:
+	 * Surprise - we got the lock. Just return to userspace:
 	 */
 	if (unlikely(!curval))
 		goto out_unlock_release_sem;
 
 	uval = curval;
+
 	/*
-	 * In case of a requeue, check if there already is an owner
-	 * If not, just take the futex.
+	 * Set the WAITERS flag, so the owner will know it has someone
+	 * to wake at next unlock
 	 */
-	if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) {
-		/* set current as futex owner */
-		newval = curval | current->pid;
-		lock_held = 1;
-	} else
-		/* Set the WAITERS flag, so the owner will know it has someone
-		   to wake at next unlock */
-		newval = curval | FUTEX_WAITERS;
+	newval = curval | FUTEX_WAITERS;
+
+	/*
+	 * There are two cases, where a futex might have no owner (the
+	 * owner TID is 0): OWNER_DIED or REQUEUE. We take over the
+	 * futex in this case. We also do an unconditional take over,
+	 * when the owner of the futex died.
+	 *
+	 * This is safe as we are protected by the hash bucket lock !
+	 */
+	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+		/* Keep the OWNER_DIED and REQUEUE bits */
+		newval = (curval & ~FUTEX_TID_MASK) | current->pid;
+		ownerdied = 0;
+		lock_taken = 1;
+	}
 
 	pagefault_disable();
 	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
@@ -1775,8 +1796,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	if (unlikely(curval != uval))
 		goto retry_locked;
 
-	if (lock_held) {
-		set_pi_futex_owner(hb, &q.key, curr);
+	/*
+	 * We took the lock due to requeue or owner died take over.
+	 */
+	if (unlikely(lock_taken)) {
+		/* For requeue we need to fixup the pi_futex */
+		if (curval & FUTEX_WAITER_REQUEUED)
+			set_pi_futex_owner(hb, &q.key, curr);
 		goto out_unlock_release_sem;
 	}
 
@@ -1787,34 +1813,40 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
 
 	if (unlikely(ret)) {
-		/*
-		 * There were no waiters and the owner task lookup
-		 * failed. When the OWNER_DIED bit is set, then we
-		 * know that this is a robust futex and we actually
-		 * take the lock. This is safe as we are protected by
-		 * the hash bucket lock. We also set the waiters bit
-		 * unconditionally here, to simplify glibc handling of
-		 * multiple tasks racing to acquire the lock and
-		 * cleanup the problems which were left by the dead
-		 * owner.
-		 */
-		if (curval & FUTEX_OWNER_DIED) {
-			uval = newval;
-			newval = current->pid |
-				FUTEX_OWNER_DIED | FUTEX_WAITERS;
+		switch (ret) {
 
-			pagefault_disable();
-			curval = futex_atomic_cmpxchg_inatomic(uaddr,
-							       uval, newval);
-			pagefault_enable();
+		case -EAGAIN:
+			/*
+			 * Task is exiting and we just wait for the
+			 * exit to complete.
+			 */
+			queue_unlock(&q, hb);
+			if (fshared)
+				up_read(fshared);
+			cond_resched();
+			goto retry;
 
-			if (unlikely(curval == -EFAULT))
+		case -ESRCH:
+			/*
+			 * No owner found for this futex. Check if the
+			 * OWNER_DIED bit is set to figure out whether
+			 * this is a robust futex or not.
+			 */
+			if (get_futex_value_locked(&curval, uaddr))
 				goto uaddr_faulted;
-			if (unlikely(curval != uval))
+
+			/*
+			 * We simply start over in case of a robust
+			 * futex. The code above will take the futex
+			 * and return happy.
+			 */
+			if (curval & FUTEX_OWNER_DIED) {
+				ownerdied = 1;
 				goto retry_locked;
-			ret = 0;
+			}
+		default:
+			goto out_unlock_release_sem;
 		}
-		goto out_unlock_release_sem;
 	}
 
 	/*
@@ -1845,31 +1877,42 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 		down_read(fshared);
 	spin_lock(q.lock_ptr);
 
-	/*
-	 * Got the lock. We might not be the anticipated owner if we
-	 * did a lock-steal - fix up the PI-state in that case.
-	 */
-	if (!ret && q.pi_state->owner != curr)
-		/* mmap_sem is unlocked at return of this function */
-		ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr);
-	else {
+	if (!ret) {
+		/*
+		 * Got the lock. We might not be the anticipated owner
+		 * if we did a lock-steal - fix up the PI-state in
+		 * that case:
+		 */
+		if (q.pi_state->owner != curr)
+			ret = fixup_pi_state_owner(uaddr, &q, curr);
+	} else {
 		/*
 		 * Catch the rare case, where the lock was released
-		 * when we were on the way back before we locked
-		 * the hash bucket.
+		 * when we were on the way back before we locked the
+		 * hash bucket.
 		 */
-		if (ret && q.pi_state->owner == curr) {
-			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-				ret = 0;
+		if (q.pi_state->owner == curr &&
+		    rt_mutex_trylock(&q.pi_state->pi_mutex)) {
+			ret = 0;
+		} else {
+			/*
+			 * Paranoia check. If we did not take the lock
+			 * in the trylock above, then we should not be
+			 * the owner of the rtmutex, neither the real
+			 * nor the pending one:
+			 */
+			if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+				printk(KERN_ERR "futex_lock_pi: ret = %d "
+				       "pi-mutex: %p pi-state %p\n", ret,
+				       q.pi_state->pi_mutex.owner,
+				       q.pi_state->owner);
 		}
-		/* Unqueue and drop the lock */
-		unqueue_me_pi(&q);
-		if (fshared)
-			up_read(fshared);
 	}
 
-	if (!detect && ret == -EDEADLK && 0)
-		force_sig(SIGKILL, current);
+	/* Unqueue and drop the lock */
+	unqueue_me_pi(&q);
+	if (fshared)
+		up_read(fshared);
 
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
@@ -1887,16 +1930,19 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 	 * non-atomically. Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
 	 * still holding the mmap_sem.
+	 *
+	 * ... and hb->lock. :-) --ANK
 	 */
+	queue_unlock(&q, hb);
+
 	if (attempt++) {
 		ret = futex_handle_fault((unsigned long)uaddr, fshared,
 					 attempt);
 		if (ret)
-			goto out_unlock_release_sem;
-		goto retry_locked;
+			goto out_release_sem;
+		goto retry_unlocked;
 	}
 
-	queue_unlock(&q, hb);
 	if (fshared)
 		up_read(fshared);
 
@@ -1940,9 +1986,9 @@ retry:
 		goto out;
 
 	hb = hash_futex(&key);
+retry_unlocked:
 	spin_lock(&hb->lock);
 
-retry_locked:
 	/*
 	 * To avoid races, try to do the TID -> 0 atomic transition
 	 * again. If it succeeds then we can return without waking
@@ -2005,16 +2051,19 @@ pi_faulted:
 	 * non-atomically. Therefore, if get_user below is not
 	 * enough, we need to handle the fault ourselves, while
 	 * still holding the mmap_sem.
+	 *
+	 * ... and hb->lock. --ANK
 	 */
+	spin_unlock(&hb->lock);
+
 	if (attempt++) {
 		ret = futex_handle_fault((unsigned long)uaddr, fshared,
 					 attempt);
 		if (ret)
-			goto out_unlock;
-		goto retry_locked;
+			goto out;
+		goto retry_unlocked;
 	}
 
-	spin_unlock(&hb->lock);
 	if (fshared)
 		up_read(fshared);
 