diff options
author | Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> | 2007-06-08 16:47:00 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-06-08 20:23:34 -0400 |
commit | 778e9a9c3e7193ea9f434f382947155ffb59c755 (patch) | |
tree | 2ceb8c7ce1d55124982b77966dcd65cee5cc623b | |
parent | 1a539a87280b3032fd12bc93a4a82f1d8aa97ca8 (diff) |
pi-futex: fix exit races and locking problems
1. New entries can be added to tsk->pi_state_list after the task has completed
exit_pi_state_list(). The result is memory leaks and deadlocks.
2. handle_mm_fault() is called under spinlock. The result is obvious.
3. An unexpected -ESRCH results in a self-inflicted deadlock inside glibc.
Sometimes futex_lock_pi() returns -ESRCH when it is not expected,
and glibc enters a for(;;) sleep() loop to simulate deadlock. This problem
is quite obvious and I think the patch is right. Though it looks like
each "if" in futex_lock_pi() got some stupid special case "else if". :-)
4. Sometimes futex_lock_pi() returns -EDEADLK
when nobody holds the lock. The reason is also obvious (see the comment
in the patch), but the correct fix is far beyond my comprehension.
I guess someone already saw this, the chunk:
if (rt_mutex_trylock(&q.pi_state->pi_mutex))
ret = 0;
is obviously from the same opera. But it does not work, because the
rtmutex is really taken at this point: wake_futex_pi() of the previous
owner reassigned it to us. My fix works, but it looks very stupid.
I would consider removing the shift of ownership in wake_futex_pi()
and doing all the work in the context of the process taking the lock.
From: Thomas Gleixner <tglx@linutronix.de>
Fix 1) Avoid the tasklist lock variant of the exit race fix by adding
an additional state transition to the exit code.
This also fixes the issue where a task with recursive segfaults
is not able to release the futexes.
Fix 2) Cleanup the lookup_pi_state() failure path and solve the -ESRCH
problem finally.
Fix 3) Solve the fixup_pi_state_owner() problem which needs to do the fixup
in the lock protected section by using the in_atomic userspace access
functions.
This also removes the ugly lock drop / unqueue inside of fixup_pi_state_owner()
Fix 4) Fix a stale lock in the error path of wake_futex_pi()
Added some error checks for verification.
The -EDEADLK problem is solved by the rtmutex fixups.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/sched.h | 1 | ||||
-rw-r--r-- | kernel/exit.c | 24 | ||||
-rw-r--r-- | kernel/futex.c | 269 |
3 files changed, 183 insertions, 111 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index d58e74b98367..693f0e6c54d4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1162,6 +1162,7 @@ static inline void put_task_struct(struct task_struct *t) | |||
1162 | /* Not implemented yet, only for 486*/ | 1162 | /* Not implemented yet, only for 486*/ |
1163 | #define PF_STARTING 0x00000002 /* being created */ | 1163 | #define PF_STARTING 0x00000002 /* being created */ |
1164 | #define PF_EXITING 0x00000004 /* getting shut down */ | 1164 | #define PF_EXITING 0x00000004 /* getting shut down */ |
1165 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ | ||
1165 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ | 1166 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ |
1166 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ | 1167 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ |
1167 | #define PF_DUMPCORE 0x00000200 /* dumped core */ | 1168 | #define PF_DUMPCORE 0x00000200 /* dumped core */ |
diff --git a/kernel/exit.c b/kernel/exit.c index 5b888c24e43e..5c8ecbaa19a5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -892,13 +892,29 @@ fastcall NORET_TYPE void do_exit(long code) | |||
892 | if (unlikely(tsk->flags & PF_EXITING)) { | 892 | if (unlikely(tsk->flags & PF_EXITING)) { |
893 | printk(KERN_ALERT | 893 | printk(KERN_ALERT |
894 | "Fixing recursive fault but reboot is needed!\n"); | 894 | "Fixing recursive fault but reboot is needed!\n"); |
895 | /* | ||
896 | * We can do this unlocked here. The futex code uses | ||
897 | * this flag just to verify whether the pi state | ||
898 | * cleanup has been done or not. In the worst case it | ||
899 | * loops once more. We pretend that the cleanup was | ||
900 | * done as there is no way to return. Either the | ||
901 | * OWNER_DIED bit is set by now or we push the blocked | ||
902 | * task into the wait for ever nirwana as well. | ||
903 | */ | ||
904 | tsk->flags |= PF_EXITPIDONE; | ||
895 | if (tsk->io_context) | 905 | if (tsk->io_context) |
896 | exit_io_context(); | 906 | exit_io_context(); |
897 | set_current_state(TASK_UNINTERRUPTIBLE); | 907 | set_current_state(TASK_UNINTERRUPTIBLE); |
898 | schedule(); | 908 | schedule(); |
899 | } | 909 | } |
900 | 910 | ||
911 | /* | ||
912 | * tsk->flags are checked in the futex code to protect against | ||
913 | * an exiting task cleaning up the robust pi futexes. | ||
914 | */ | ||
915 | spin_lock_irq(&tsk->pi_lock); | ||
901 | tsk->flags |= PF_EXITING; | 916 | tsk->flags |= PF_EXITING; |
917 | spin_unlock_irq(&tsk->pi_lock); | ||
902 | 918 | ||
903 | if (unlikely(in_atomic())) | 919 | if (unlikely(in_atomic())) |
904 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", | 920 | printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", |
@@ -912,7 +928,7 @@ fastcall NORET_TYPE void do_exit(long code) | |||
912 | } | 928 | } |
913 | group_dead = atomic_dec_and_test(&tsk->signal->live); | 929 | group_dead = atomic_dec_and_test(&tsk->signal->live); |
914 | if (group_dead) { | 930 | if (group_dead) { |
915 | hrtimer_cancel(&tsk->signal->real_timer); | 931 | hrtimer_cancel(&tsk->signal->real_timer); |
916 | exit_itimers(tsk->signal); | 932 | exit_itimers(tsk->signal); |
917 | } | 933 | } |
918 | acct_collect(code, group_dead); | 934 | acct_collect(code, group_dead); |
@@ -965,6 +981,12 @@ fastcall NORET_TYPE void do_exit(long code) | |||
965 | * Make sure we are holding no locks: | 981 | * Make sure we are holding no locks: |
966 | */ | 982 | */ |
967 | debug_check_no_locks_held(tsk); | 983 | debug_check_no_locks_held(tsk); |
984 | /* | ||
985 | * We can do this unlocked here. The futex code uses this flag | ||
986 | * just to verify whether the pi state cleanup has been done | ||
987 | * or not. In the worst case it loops once more. | ||
988 | */ | ||
989 | tsk->flags |= PF_EXITPIDONE; | ||
968 | 990 | ||
969 | if (tsk->io_context) | 991 | if (tsk->io_context) |
970 | exit_io_context(); | 992 | exit_io_context(); |
diff --git a/kernel/futex.c b/kernel/futex.c index b7ce15c67e32..3b7f7713d9a4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -430,10 +430,6 @@ static struct task_struct * futex_find_get_task(pid_t pid) | |||
430 | p = NULL; | 430 | p = NULL; |
431 | goto out_unlock; | 431 | goto out_unlock; |
432 | } | 432 | } |
433 | if (p->exit_state != 0) { | ||
434 | p = NULL; | ||
435 | goto out_unlock; | ||
436 | } | ||
437 | get_task_struct(p); | 433 | get_task_struct(p); |
438 | out_unlock: | 434 | out_unlock: |
439 | rcu_read_unlock(); | 435 | rcu_read_unlock(); |
@@ -502,7 +498,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
502 | struct futex_q *this, *next; | 498 | struct futex_q *this, *next; |
503 | struct plist_head *head; | 499 | struct plist_head *head; |
504 | struct task_struct *p; | 500 | struct task_struct *p; |
505 | pid_t pid; | 501 | pid_t pid = uval & FUTEX_TID_MASK; |
506 | 502 | ||
507 | head = &hb->chain; | 503 | head = &hb->chain; |
508 | 504 | ||
@@ -520,6 +516,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
520 | return -EINVAL; | 516 | return -EINVAL; |
521 | 517 | ||
522 | WARN_ON(!atomic_read(&pi_state->refcount)); | 518 | WARN_ON(!atomic_read(&pi_state->refcount)); |
519 | WARN_ON(pid && pi_state->owner && | ||
520 | pi_state->owner->pid != pid); | ||
523 | 521 | ||
524 | atomic_inc(&pi_state->refcount); | 522 | atomic_inc(&pi_state->refcount); |
525 | *ps = pi_state; | 523 | *ps = pi_state; |
@@ -530,15 +528,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
530 | 528 | ||
531 | /* | 529 | /* |
532 | * We are the first waiter - try to look up the real owner and attach | 530 | * We are the first waiter - try to look up the real owner and attach |
533 | * the new pi_state to it, but bail out when the owner died bit is set | 531 | * the new pi_state to it, but bail out when TID = 0 |
534 | * and TID = 0: | ||
535 | */ | 532 | */ |
536 | pid = uval & FUTEX_TID_MASK; | 533 | if (!pid) |
537 | if (!pid && (uval & FUTEX_OWNER_DIED)) | ||
538 | return -ESRCH; | 534 | return -ESRCH; |
539 | p = futex_find_get_task(pid); | 535 | p = futex_find_get_task(pid); |
540 | if (!p) | 536 | if (IS_ERR(p)) |
541 | return -ESRCH; | 537 | return PTR_ERR(p); |
538 | |||
539 | /* | ||
540 | * We need to look at the task state flags to figure out, | ||
541 | * whether the task is exiting. To protect against the do_exit | ||
542 | * change of the task flags, we do this protected by | ||
543 | * p->pi_lock: | ||
544 | */ | ||
545 | spin_lock_irq(&p->pi_lock); | ||
546 | if (unlikely(p->flags & PF_EXITING)) { | ||
547 | /* | ||
548 | * The task is on the way out. When PF_EXITPIDONE is | ||
549 | * set, we know that the task has finished the | ||
550 | * cleanup: | ||
551 | */ | ||
552 | int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; | ||
553 | |||
554 | spin_unlock_irq(&p->pi_lock); | ||
555 | put_task_struct(p); | ||
556 | return ret; | ||
557 | } | ||
542 | 558 | ||
543 | pi_state = alloc_pi_state(); | 559 | pi_state = alloc_pi_state(); |
544 | 560 | ||
@@ -551,7 +567,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
551 | /* Store the key for possible exit cleanups: */ | 567 | /* Store the key for possible exit cleanups: */ |
552 | pi_state->key = *key; | 568 | pi_state->key = *key; |
553 | 569 | ||
554 | spin_lock_irq(&p->pi_lock); | ||
555 | WARN_ON(!list_empty(&pi_state->list)); | 570 | WARN_ON(!list_empty(&pi_state->list)); |
556 | list_add(&pi_state->list, &p->pi_state_list); | 571 | list_add(&pi_state->list, &p->pi_state_list); |
557 | pi_state->owner = p; | 572 | pi_state->owner = p; |
@@ -618,6 +633,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
618 | * preserve the owner died bit.) | 633 | * preserve the owner died bit.) |
619 | */ | 634 | */ |
620 | if (!(uval & FUTEX_OWNER_DIED)) { | 635 | if (!(uval & FUTEX_OWNER_DIED)) { |
636 | int ret = 0; | ||
637 | |||
621 | newval = FUTEX_WAITERS | new_owner->pid; | 638 | newval = FUTEX_WAITERS | new_owner->pid; |
622 | /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */ | 639 | /* Keep the FUTEX_WAITER_REQUEUED flag if it was set */ |
623 | newval |= (uval & FUTEX_WAITER_REQUEUED); | 640 | newval |= (uval & FUTEX_WAITER_REQUEUED); |
@@ -625,10 +642,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
625 | pagefault_disable(); | 642 | pagefault_disable(); |
626 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 643 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); |
627 | pagefault_enable(); | 644 | pagefault_enable(); |
645 | |||
628 | if (curval == -EFAULT) | 646 | if (curval == -EFAULT) |
629 | return -EFAULT; | 647 | ret = -EFAULT; |
630 | if (curval != uval) | 648 | if (curval != uval) |
631 | return -EINVAL; | 649 | ret = -EINVAL; |
650 | if (ret) { | ||
651 | spin_unlock(&pi_state->pi_mutex.wait_lock); | ||
652 | return ret; | ||
653 | } | ||
632 | } | 654 | } |
633 | 655 | ||
634 | spin_lock_irq(&pi_state->owner->pi_lock); | 656 | spin_lock_irq(&pi_state->owner->pi_lock); |
@@ -1174,7 +1196,7 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, | |||
1174 | #ifdef CONFIG_DEBUG_PI_LIST | 1196 | #ifdef CONFIG_DEBUG_PI_LIST |
1175 | this->list.plist.lock = &hb2->lock; | 1197 | this->list.plist.lock = &hb2->lock; |
1176 | #endif | 1198 | #endif |
1177 | } | 1199 | } |
1178 | this->key = key2; | 1200 | this->key = key2; |
1179 | get_futex_key_refs(&key2); | 1201 | get_futex_key_refs(&key2); |
1180 | drop_count++; | 1202 | drop_count++; |
@@ -1326,12 +1348,10 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1326 | /* | 1348 | /* |
1327 | * Fixup the pi_state owner with current. | 1349 | * Fixup the pi_state owner with current. |
1328 | * | 1350 | * |
1329 | * The cur->mm semaphore must be held, it is released at return of this | 1351 | * Must be called with hash bucket lock held and mm->sem held for non |
1330 | * function. | 1352 | * private futexes. |
1331 | */ | 1353 | */ |
1332 | static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared, | 1354 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, |
1333 | struct futex_q *q, | ||
1334 | struct futex_hash_bucket *hb, | ||
1335 | struct task_struct *curr) | 1355 | struct task_struct *curr) |
1336 | { | 1356 | { |
1337 | u32 newtid = curr->pid | FUTEX_WAITERS; | 1357 | u32 newtid = curr->pid | FUTEX_WAITERS; |
@@ -1355,23 +1375,24 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1355 | list_add(&pi_state->list, &curr->pi_state_list); | 1375 | list_add(&pi_state->list, &curr->pi_state_list); |
1356 | spin_unlock_irq(&curr->pi_lock); | 1376 | spin_unlock_irq(&curr->pi_lock); |
1357 | 1377 | ||
1358 | /* Unqueue and drop the lock */ | ||
1359 | unqueue_me_pi(q); | ||
1360 | if (fshared) | ||
1361 | up_read(fshared); | ||
1362 | /* | 1378 | /* |
1363 | * We own it, so we have to replace the pending owner | 1379 | * We own it, so we have to replace the pending owner |
1364 | * TID. This must be atomic as we have preserve the | 1380 | * TID. This must be atomic as we have preserve the |
1365 | * owner died bit here. | 1381 | * owner died bit here. |
1366 | */ | 1382 | */ |
1367 | ret = get_user(uval, uaddr); | 1383 | ret = get_futex_value_locked(&uval, uaddr); |
1384 | |||
1368 | while (!ret) { | 1385 | while (!ret) { |
1369 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | 1386 | newval = (uval & FUTEX_OWNER_DIED) | newtid; |
1370 | newval |= (uval & FUTEX_WAITER_REQUEUED); | 1387 | newval |= (uval & FUTEX_WAITER_REQUEUED); |
1388 | |||
1389 | pagefault_disable(); | ||
1371 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | 1390 | curval = futex_atomic_cmpxchg_inatomic(uaddr, |
1372 | uval, newval); | 1391 | uval, newval); |
1392 | pagefault_enable(); | ||
1393 | |||
1373 | if (curval == -EFAULT) | 1394 | if (curval == -EFAULT) |
1374 | ret = -EFAULT; | 1395 | ret = -EFAULT; |
1375 | if (curval == uval) | 1396 | if (curval == uval) |
1376 | break; | 1397 | break; |
1377 | uval = curval; | 1398 | uval = curval; |
@@ -1553,10 +1574,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1553 | */ | 1574 | */ |
1554 | uaddr = q.pi_state->key.uaddr; | 1575 | uaddr = q.pi_state->key.uaddr; |
1555 | 1576 | ||
1556 | /* mmap_sem and hash_bucket lock are unlocked at | 1577 | ret = fixup_pi_state_owner(uaddr, &q, curr); |
1557 | return of this function */ | ||
1558 | ret = fixup_pi_state_owner(uaddr, fshared, | ||
1559 | &q, hb, curr); | ||
1560 | } else { | 1578 | } else { |
1561 | /* | 1579 | /* |
1562 | * Catch the rare case, where the lock was released | 1580 | * Catch the rare case, where the lock was released |
@@ -1567,12 +1585,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1567 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | 1585 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) |
1568 | ret = 0; | 1586 | ret = 0; |
1569 | } | 1587 | } |
1570 | /* Unqueue and drop the lock */ | ||
1571 | unqueue_me_pi(&q); | ||
1572 | if (fshared) | ||
1573 | up_read(fshared); | ||
1574 | } | 1588 | } |
1575 | 1589 | ||
1590 | /* Unqueue and drop the lock */ | ||
1591 | unqueue_me_pi(&q); | ||
1592 | if (fshared) | ||
1593 | up_read(fshared); | ||
1594 | |||
1576 | debug_rt_mutex_free_waiter(&q.waiter); | 1595 | debug_rt_mutex_free_waiter(&q.waiter); |
1577 | 1596 | ||
1578 | return ret; | 1597 | return ret; |
@@ -1688,7 +1707,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1688 | struct futex_hash_bucket *hb; | 1707 | struct futex_hash_bucket *hb; |
1689 | u32 uval, newval, curval; | 1708 | u32 uval, newval, curval; |
1690 | struct futex_q q; | 1709 | struct futex_q q; |
1691 | int ret, lock_held, attempt = 0; | 1710 | int ret, lock_taken, ownerdied = 0, attempt = 0; |
1692 | 1711 | ||
1693 | if (refill_pi_state_cache()) | 1712 | if (refill_pi_state_cache()) |
1694 | return -ENOMEM; | 1713 | return -ENOMEM; |
@@ -1709,10 +1728,11 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1709 | if (unlikely(ret != 0)) | 1728 | if (unlikely(ret != 0)) |
1710 | goto out_release_sem; | 1729 | goto out_release_sem; |
1711 | 1730 | ||
1731 | retry_unlocked: | ||
1712 | hb = queue_lock(&q, -1, NULL); | 1732 | hb = queue_lock(&q, -1, NULL); |
1713 | 1733 | ||
1714 | retry_locked: | 1734 | retry_locked: |
1715 | lock_held = 0; | 1735 | ret = lock_taken = 0; |
1716 | 1736 | ||
1717 | /* | 1737 | /* |
1718 | * To avoid races, we attempt to take the lock here again | 1738 | * To avoid races, we attempt to take the lock here again |
@@ -1728,43 +1748,44 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1728 | if (unlikely(curval == -EFAULT)) | 1748 | if (unlikely(curval == -EFAULT)) |
1729 | goto uaddr_faulted; | 1749 | goto uaddr_faulted; |
1730 | 1750 | ||
1731 | /* We own the lock already */ | 1751 | /* |
1752 | * Detect deadlocks. In case of REQUEUE_PI this is a valid | ||
1753 | * situation and we return success to user space. | ||
1754 | */ | ||
1732 | if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { | 1755 | if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { |
1733 | if (!detect && 0) | ||
1734 | force_sig(SIGKILL, current); | ||
1735 | /* | ||
1736 | * Normally, this check is done in user space. | ||
1737 | * In case of requeue, the owner may attempt to lock this futex, | ||
1738 | * even if the ownership has already been given by the previous | ||
1739 | * waker. | ||
1740 | * In the usual case, this is a case of deadlock, but not in case | ||
1741 | * of REQUEUE_PI. | ||
1742 | */ | ||
1743 | if (!(curval & FUTEX_WAITER_REQUEUED)) | 1756 | if (!(curval & FUTEX_WAITER_REQUEUED)) |
1744 | ret = -EDEADLK; | 1757 | ret = -EDEADLK; |
1745 | goto out_unlock_release_sem; | 1758 | goto out_unlock_release_sem; |
1746 | } | 1759 | } |
1747 | 1760 | ||
1748 | /* | 1761 | /* |
1749 | * Surprise - we got the lock. Just return | 1762 | * Surprise - we got the lock. Just return to userspace: |
1750 | * to userspace: | ||
1751 | */ | 1763 | */ |
1752 | if (unlikely(!curval)) | 1764 | if (unlikely(!curval)) |
1753 | goto out_unlock_release_sem; | 1765 | goto out_unlock_release_sem; |
1754 | 1766 | ||
1755 | uval = curval; | 1767 | uval = curval; |
1768 | |||
1756 | /* | 1769 | /* |
1757 | * In case of a requeue, check if there already is an owner | 1770 | * Set the WAITERS flag, so the owner will know it has someone |
1758 | * If not, just take the futex. | 1771 | * to wake at next unlock |
1759 | */ | 1772 | */ |
1760 | if ((curval & FUTEX_WAITER_REQUEUED) && !(curval & FUTEX_TID_MASK)) { | 1773 | newval = curval | FUTEX_WAITERS; |
1761 | /* set current as futex owner */ | 1774 | |
1762 | newval = curval | current->pid; | 1775 | /* |
1763 | lock_held = 1; | 1776 | * There are two cases, where a futex might have no owner (the |
1764 | } else | 1777 | * owner TID is 0): OWNER_DIED or REQUEUE. We take over the |
1765 | /* Set the WAITERS flag, so the owner will know it has someone | 1778 | * futex in this case. We also do an unconditional take over, |
1766 | to wake at next unlock */ | 1779 | * when the owner of the futex died. |
1767 | newval = curval | FUTEX_WAITERS; | 1780 | * |
1781 | * This is safe as we are protected by the hash bucket lock ! | ||
1782 | */ | ||
1783 | if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { | ||
1784 | /* Keep the OWNER_DIED and REQUEUE bits */ | ||
1785 | newval = (curval & ~FUTEX_TID_MASK) | current->pid; | ||
1786 | ownerdied = 0; | ||
1787 | lock_taken = 1; | ||
1788 | } | ||
1768 | 1789 | ||
1769 | pagefault_disable(); | 1790 | pagefault_disable(); |
1770 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 1791 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); |
@@ -1775,8 +1796,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1775 | if (unlikely(curval != uval)) | 1796 | if (unlikely(curval != uval)) |
1776 | goto retry_locked; | 1797 | goto retry_locked; |
1777 | 1798 | ||
1778 | if (lock_held) { | 1799 | /* |
1779 | set_pi_futex_owner(hb, &q.key, curr); | 1800 | * We took the lock due to requeue or owner died take over. |
1801 | */ | ||
1802 | if (unlikely(lock_taken)) { | ||
1803 | /* For requeue we need to fixup the pi_futex */ | ||
1804 | if (curval & FUTEX_WAITER_REQUEUED) | ||
1805 | set_pi_futex_owner(hb, &q.key, curr); | ||
1780 | goto out_unlock_release_sem; | 1806 | goto out_unlock_release_sem; |
1781 | } | 1807 | } |
1782 | 1808 | ||
@@ -1787,34 +1813,40 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1787 | ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); | 1813 | ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); |
1788 | 1814 | ||
1789 | if (unlikely(ret)) { | 1815 | if (unlikely(ret)) { |
1790 | /* | 1816 | switch (ret) { |
1791 | * There were no waiters and the owner task lookup | ||
1792 | * failed. When the OWNER_DIED bit is set, then we | ||
1793 | * know that this is a robust futex and we actually | ||
1794 | * take the lock. This is safe as we are protected by | ||
1795 | * the hash bucket lock. We also set the waiters bit | ||
1796 | * unconditionally here, to simplify glibc handling of | ||
1797 | * multiple tasks racing to acquire the lock and | ||
1798 | * cleanup the problems which were left by the dead | ||
1799 | * owner. | ||
1800 | */ | ||
1801 | if (curval & FUTEX_OWNER_DIED) { | ||
1802 | uval = newval; | ||
1803 | newval = current->pid | | ||
1804 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | ||
1805 | 1817 | ||
1806 | pagefault_disable(); | 1818 | case -EAGAIN: |
1807 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | 1819 | /* |
1808 | uval, newval); | 1820 | * Task is exiting and we just wait for the |
1809 | pagefault_enable(); | 1821 | * exit to complete. |
1822 | */ | ||
1823 | queue_unlock(&q, hb); | ||
1824 | if (fshared) | ||
1825 | up_read(fshared); | ||
1826 | cond_resched(); | ||
1827 | goto retry; | ||
1810 | 1828 | ||
1811 | if (unlikely(curval == -EFAULT)) | 1829 | case -ESRCH: |
1830 | /* | ||
1831 | * No owner found for this futex. Check if the | ||
1832 | * OWNER_DIED bit is set to figure out whether | ||
1833 | * this is a robust futex or not. | ||
1834 | */ | ||
1835 | if (get_futex_value_locked(&curval, uaddr)) | ||
1812 | goto uaddr_faulted; | 1836 | goto uaddr_faulted; |
1813 | if (unlikely(curval != uval)) | 1837 | |
1838 | /* | ||
1839 | * We simply start over in case of a robust | ||
1840 | * futex. The code above will take the futex | ||
1841 | * and return happy. | ||
1842 | */ | ||
1843 | if (curval & FUTEX_OWNER_DIED) { | ||
1844 | ownerdied = 1; | ||
1814 | goto retry_locked; | 1845 | goto retry_locked; |
1815 | ret = 0; | 1846 | } |
1847 | default: | ||
1848 | goto out_unlock_release_sem; | ||
1816 | } | 1849 | } |
1817 | goto out_unlock_release_sem; | ||
1818 | } | 1850 | } |
1819 | 1851 | ||
1820 | /* | 1852 | /* |
@@ -1845,31 +1877,42 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1845 | down_read(fshared); | 1877 | down_read(fshared); |
1846 | spin_lock(q.lock_ptr); | 1878 | spin_lock(q.lock_ptr); |
1847 | 1879 | ||
1848 | /* | 1880 | if (!ret) { |
1849 | * Got the lock. We might not be the anticipated owner if we | 1881 | /* |
1850 | * did a lock-steal - fix up the PI-state in that case. | 1882 | * Got the lock. We might not be the anticipated owner |
1851 | */ | 1883 | * if we did a lock-steal - fix up the PI-state in |
1852 | if (!ret && q.pi_state->owner != curr) | 1884 | * that case: |
1853 | /* mmap_sem is unlocked at return of this function */ | 1885 | */ |
1854 | ret = fixup_pi_state_owner(uaddr, fshared, &q, hb, curr); | 1886 | if (q.pi_state->owner != curr) |
1855 | else { | 1887 | ret = fixup_pi_state_owner(uaddr, &q, curr); |
1888 | } else { | ||
1856 | /* | 1889 | /* |
1857 | * Catch the rare case, where the lock was released | 1890 | * Catch the rare case, where the lock was released |
1858 | * when we were on the way back before we locked | 1891 | * when we were on the way back before we locked the |
1859 | * the hash bucket. | 1892 | * hash bucket. |
1860 | */ | 1893 | */ |
1861 | if (ret && q.pi_state->owner == curr) { | 1894 | if (q.pi_state->owner == curr && |
1862 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | 1895 | rt_mutex_trylock(&q.pi_state->pi_mutex)) { |
1863 | ret = 0; | 1896 | ret = 0; |
1897 | } else { | ||
1898 | /* | ||
1899 | * Paranoia check. If we did not take the lock | ||
1900 | * in the trylock above, then we should not be | ||
1901 | * the owner of the rtmutex, neither the real | ||
1902 | * nor the pending one: | ||
1903 | */ | ||
1904 | if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) | ||
1905 | printk(KERN_ERR "futex_lock_pi: ret = %d " | ||
1906 | "pi-mutex: %p pi-state %p\n", ret, | ||
1907 | q.pi_state->pi_mutex.owner, | ||
1908 | q.pi_state->owner); | ||
1864 | } | 1909 | } |
1865 | /* Unqueue and drop the lock */ | ||
1866 | unqueue_me_pi(&q); | ||
1867 | if (fshared) | ||
1868 | up_read(fshared); | ||
1869 | } | 1910 | } |
1870 | 1911 | ||
1871 | if (!detect && ret == -EDEADLK && 0) | 1912 | /* Unqueue and drop the lock */ |
1872 | force_sig(SIGKILL, current); | 1913 | unqueue_me_pi(&q); |
1914 | if (fshared) | ||
1915 | up_read(fshared); | ||
1873 | 1916 | ||
1874 | return ret != -EINTR ? ret : -ERESTARTNOINTR; | 1917 | return ret != -EINTR ? ret : -ERESTARTNOINTR; |
1875 | 1918 | ||
@@ -1887,16 +1930,19 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1887 | * non-atomically. Therefore, if get_user below is not | 1930 | * non-atomically. Therefore, if get_user below is not |
1888 | * enough, we need to handle the fault ourselves, while | 1931 | * enough, we need to handle the fault ourselves, while |
1889 | * still holding the mmap_sem. | 1932 | * still holding the mmap_sem. |
1933 | * | ||
1934 | * ... and hb->lock. :-) --ANK | ||
1890 | */ | 1935 | */ |
1936 | queue_unlock(&q, hb); | ||
1937 | |||
1891 | if (attempt++) { | 1938 | if (attempt++) { |
1892 | ret = futex_handle_fault((unsigned long)uaddr, fshared, | 1939 | ret = futex_handle_fault((unsigned long)uaddr, fshared, |
1893 | attempt); | 1940 | attempt); |
1894 | if (ret) | 1941 | if (ret) |
1895 | goto out_unlock_release_sem; | 1942 | goto out_release_sem; |
1896 | goto retry_locked; | 1943 | goto retry_unlocked; |
1897 | } | 1944 | } |
1898 | 1945 | ||
1899 | queue_unlock(&q, hb); | ||
1900 | if (fshared) | 1946 | if (fshared) |
1901 | up_read(fshared); | 1947 | up_read(fshared); |
1902 | 1948 | ||
@@ -1940,9 +1986,9 @@ retry: | |||
1940 | goto out; | 1986 | goto out; |
1941 | 1987 | ||
1942 | hb = hash_futex(&key); | 1988 | hb = hash_futex(&key); |
1989 | retry_unlocked: | ||
1943 | spin_lock(&hb->lock); | 1990 | spin_lock(&hb->lock); |
1944 | 1991 | ||
1945 | retry_locked: | ||
1946 | /* | 1992 | /* |
1947 | * To avoid races, try to do the TID -> 0 atomic transition | 1993 | * To avoid races, try to do the TID -> 0 atomic transition |
1948 | * again. If it succeeds then we can return without waking | 1994 | * again. If it succeeds then we can return without waking |
@@ -2005,16 +2051,19 @@ pi_faulted: | |||
2005 | * non-atomically. Therefore, if get_user below is not | 2051 | * non-atomically. Therefore, if get_user below is not |
2006 | * enough, we need to handle the fault ourselves, while | 2052 | * enough, we need to handle the fault ourselves, while |
2007 | * still holding the mmap_sem. | 2053 | * still holding the mmap_sem. |
2054 | * | ||
2055 | * ... and hb->lock. --ANK | ||
2008 | */ | 2056 | */ |
2057 | spin_unlock(&hb->lock); | ||
2058 | |||
2009 | if (attempt++) { | 2059 | if (attempt++) { |
2010 | ret = futex_handle_fault((unsigned long)uaddr, fshared, | 2060 | ret = futex_handle_fault((unsigned long)uaddr, fshared, |
2011 | attempt); | 2061 | attempt); |
2012 | if (ret) | 2062 | if (ret) |
2013 | goto out_unlock; | 2063 | goto out; |
2014 | goto retry_locked; | 2064 | goto retry_unlocked; |
2015 | } | 2065 | } |
2016 | 2066 | ||
2017 | spin_unlock(&hb->lock); | ||
2018 | if (fshared) | 2067 | if (fshared) |
2019 | up_read(fshared); | 2068 | up_read(fshared); |
2020 | 2069 | ||