Diffstat (limited to 'kernel/futex.c'):

 -rw-r--r--  kernel/futex.c | 147
 1 file changed, 76 insertions(+), 71 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28accd6..bda415715382 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
 	return NULL;
 }
 
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+				      u32 uval, u32 newval)
 {
-	u32 curval;
+	int ret;
 
 	pagefault_disable();
-	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
 	pagefault_enable();
 
-	return curval;
+	return ret;
 }
 
 static int get_futex_value_locked(u32 *dest, u32 __user *from)
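This signature change is the theme of most hunks below: the old helper returned the old futex value directly, so callers compared the result against -EFAULT, which conflates a fault with a futex that legitimately holds the bit pattern of -EFAULT (0xfffffff2 as a u32). The new convention returns only an error code and hands the old value back through *curval. A minimal userspace sketch of the distinction, using a mock rather than the kernel helper:

	#include <stdio.h>

	/*
	 * Illustrative mock, not kernel code: under the old convention a
	 * fault was indistinguishable from a futex whose value happened to
	 * equal (u32)-EFAULT.  The new convention separates the error code
	 * (return value) from the observed old value (*curval).
	 */
	static int cmpxchg_mock(unsigned int *curval, unsigned int *uaddr,
				unsigned int uval, unsigned int newval)
	{
		if (!uaddr)
			return -14;	/* simulated -EFAULT */
		*curval = *uaddr;
		if (*uaddr == uval)
			*uaddr = newval;
		return 0;
	}

	int main(void)
	{
		unsigned int futex = 0xfffffff2, curval;

		/* The old convention would have misread this value as a fault. */
		if (cmpxchg_mock(&curval, &futex, 0xfffffff2, 1))
			return 1;
		printf("old value %#x, futex now %u\n", curval, futex);
		return 0;
	}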
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 				struct task_struct *task, int set_waiters)
 {
 	int lock_taken, ret, ownerdied = 0;
-	u32 uval, newval, curval;
+	u32 uval, newval, curval, vpid = task_pid_vnr(task);
 
 retry:
 	ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
 	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
 	 * the locks. It will most likely not succeed.
 	 */
-	newval = task_pid_vnr(task);
+	newval = vpid;
 	if (set_waiters)
 		newval |= FUTEX_WAITERS;
 
-	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
-	if (unlikely(curval == -EFAULT))
+	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
 		return -EFAULT;
 
 	/*
 	 * Detect deadlocks.
 	 */
-	if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+	if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
 		return -EDEADLK;
 
 	/*
@@ -723,14 +722,12 @@ retry:
 	 */
 	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
 		/* Keep the OWNER_DIED bit */
-		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+		newval = (curval & ~FUTEX_TID_MASK) | vpid;
 		ownerdied = 0;
 		lock_taken = 1;
 	}
 
-	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-	if (unlikely(curval == -EFAULT))
+	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
 		return -EFAULT;
 	if (unlikely(curval != uval))
 		goto retry;
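The goto retry here is the classic lock-free update loop: recompute newval from the value last observed, attempt the cmpxchg, and loop whenever curval comes back different from the uval that newval was based on. A hedged userspace sketch of the same loop with C11 atomics (constants and names are illustrative, not the kernel's):

	#include <stdatomic.h>
	#include <stdio.h>

	#define FUTEX_WAITERS	0x80000000u
	#define FUTEX_TID_MASK	0x3fffffffu

	/*
	 * Sketch of the futex_lock_pi_atomic() ownerdied takeover: swap in
	 * our TID while preserving the non-TID bits, retrying if another
	 * task changed the word meanwhile.  C11's compare_exchange writes
	 * the observed value back into 'uval', mirroring &curval above.
	 */
	static void take_over(atomic_uint *futex, unsigned int vpid)
	{
		unsigned int uval = atomic_load(futex);
		unsigned int newval;

		do {
			newval = (uval & ~FUTEX_TID_MASK) | vpid;
		} while (!atomic_compare_exchange_strong(futex, &uval, newval));
	}

	int main(void)
	{
		atomic_uint futex = FUTEX_WAITERS | 42;	/* owned by TID 42 */

		take_over(&futex, 1234);
		printf("futex now %#x\n", atomic_load(&futex));
		return 0;
	}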
@@ -775,6 +772,24 @@ retry:
 	return ret;
 }
 
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q:	The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+	struct futex_hash_bucket *hb;
+
+	if (WARN_ON(!q->lock_ptr || !spin_is_locked(q->lock_ptr)
+			|| plist_node_empty(&q->list)))
+		return;
+
+	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+	plist_del(&q->list, &hb->chain);
+}
+
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
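The new helper recovers the hash bucket from q->lock_ptr via container_of(), exploiting the fact that the lock is embedded inside struct futex_hash_bucket; that is what lets the call sites below drop their open-coded plist bookkeeping. A standalone sketch of the idiom, with a hypothetical struct standing in for the kernel's:

	#include <stddef.h>
	#include <stdio.h>

	/* The kernel's container_of(), reduced to its portable core. */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	/* Hypothetical stand-in for struct futex_hash_bucket. */
	struct bucket {
		int chain;	/* stands in for the plist_head */
		int lock;	/* q->lock_ptr points at this member */
	};

	int main(void)
	{
		struct bucket hb = { .chain = 7 };
		int *lock_ptr = &hb.lock;	/* all a futex_q carries around */
		struct bucket *b = container_of(lock_ptr, struct bucket, lock);

		printf("recovered chain: %d\n", b->chain);
		return 0;
	}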
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
 	 */
 	get_task_struct(p);
 
-	plist_del(&q->list, &q->list.plist);
+	__unqueue_futex(q);
 	/*
 	 * The waiting task can free the futex_q as soon as
 	 * q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 
 	newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
-	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-	if (curval == -EFAULT)
+	if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
 		ret = -EFAULT;
 	else if (curval != uval)
 		ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 	 * There is no waiter, so we unlock the futex. The owner died
 	 * bit has not to be preserved here. We are the owner:
 	 */
-	oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
-	if (oldval == -EFAULT)
-		return oldval;
+	if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+		return -EFAULT;
 	if (oldval != uval)
 		return -EAGAIN;
 
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 		plist_del(&q->list, &hb1->chain);
 		plist_add(&q->list, &hb2->chain);
 		q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-		q->list.plist.spinlock = &hb2->lock;
-#endif
 	}
 	get_futex_key_refs(key2);
 	q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 	get_futex_key_refs(key);
 	q->key = *key;
 
-	WARN_ON(plist_node_empty(&q->list));
-	plist_del(&q->list, &q->list.plist);
+	__unqueue_futex(q);
 
 	WARN_ON(!q->rt_waiter);
 	q->rt_waiter = NULL;
 
 	q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-	q->list.plist.spinlock = &hb->lock;
-#endif
 
 	wake_up_state(q->task, TASK_NORMAL);
 }
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 	prio = min(current->normal_prio, MAX_RT_PRIO);
 
 	plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
-	q->list.plist.spinlock = &hb->lock;
-#endif
 	plist_add(&q->list, &hb->chain);
 	q->task = current;
 	spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
 		spin_unlock(lock_ptr);
 		goto retry;
 	}
-	WARN_ON(plist_node_empty(&q->list));
-	plist_del(&q->list, &q->list.plist);
+	__unqueue_futex(q);
 
 	BUG_ON(q->pi_state);
 
@@ -1525,8 +1525,7 @@ retry:
 static void unqueue_me_pi(struct futex_q *q)
 	__releases(q->lock_ptr)
 {
-	WARN_ON(plist_node_empty(&q->list));
-	plist_del(&q->list, &q->list.plist);
+	__unqueue_futex(q);
 
 	BUG_ON(!q->pi_state);
 	free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
 	/*
 	 * We are here either because we stole the rtmutex from the
-	 * pending owner or we are the pending owner which failed to
-	 * get the rtmutex. We have to replace the pending owner TID
-	 * in the user space variable. This must be atomic as we have
-	 * to preserve the owner died bit here.
+	 * previous highest priority waiter or we are the highest priority
+	 * waiter but failed to get the rtmutex the first time.
+	 * We have to replace the newowner TID in the user space variable.
+	 * This must be atomic as we have to preserve the owner died bit here.
 	 *
 	 * Note: We write the user space value _before_ changing the pi_state
 	 * because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
 	while (1) {
 		newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
-		curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-		if (curval == -EFAULT)
+		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
 			goto handle_fault;
 		if (curval == uval)
 			break;
@@ -1608,8 +1605,8 @@ retry:
 
 	/*
 	 * To handle the page fault we need to drop the hash bucket
-	 * lock here. That gives the other task (either the pending
-	 * owner itself or the task which stole the rtmutex) the
+	 * lock here. That gives the other task (either the highest priority
+	 * waiter itself or the task which stole the rtmutex) the
 	 * chance to try the fixup of the pi_state. So once we are
 	 * back from handling the fault we need to check the pi_state
 	 * after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		/*
 		 * pi_state is incorrect, some other task did a lock steal and
 		 * we returned due to timeout or signal without taking the
-		 * rt_mutex. Too late. We can access the rt_mutex_owner without
-		 * locking, as the other task is now blocked on the hash bucket
-		 * lock. Fix the state up.
+		 * rt_mutex. Too late.
 		 */
+		raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
 		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+		if (!owner)
+			owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+		raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
 		ret = fixup_pi_state_owner(uaddr, q, owner);
 		goto out;
 	}
 
 	/*
 	 * Paranoia check. If we did not take the lock, then we should not be
-	 * the owner, nor the pending owner, of the rt_mutex.
+	 * the owner of the rt_mutex.
 	 */
 	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
 		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
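The replacement comment is terse, so for context: the rtmutex owner can be NULL while ownership is mid-handover, hence the new rt_mutex_next_owner() fallback, and both reads now happen under pi_mutex.wait_lock rather than relying on the other task being blocked elsewhere. A hedged pthread analog of sampling two related fields consistently under the lock that serializes them (all names here are hypothetical):

	#include <pthread.h>
	#include <stdio.h>

	struct pi_state_sim {
		pthread_mutex_t wait_lock;
		const char *owner;	/* may be NULL transiently */
		const char *next_owner;	/* highest priority waiter */
	};

	/* Take the lock, read the primary field, fall back if it is empty. */
	static const char *sample_owner(struct pi_state_sim *ps)
	{
		const char *owner;

		pthread_mutex_lock(&ps->wait_lock);
		owner = ps->owner;
		if (!owner)		/* mid-handover: use the waiter */
			owner = ps->next_owner;
		pthread_mutex_unlock(&ps->wait_lock);
		return owner;
	}

	int main(void)
	{
		struct pi_state_sim ps = {
			.wait_lock = PTHREAD_MUTEX_INITIALIZER,
			.owner = NULL,
			.next_owner = "waiter-A",
		};

		printf("resolved owner: %s\n", sample_owner(&ps));
		return 0;
	}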
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
 	 *
 	 * The basic logical guarantee of a futex is that it blocks ONLY
 	 * if cond(var) is known to be true at the time of blocking, for
-	 * any cond.  If we queued after testing *uaddr, that would open
-	 * a race condition where we could block indefinitely with
+	 * any cond.  If we locked the hash-bucket after testing *uaddr, that
+	 * would open a race condition where we could block indefinitely with
 	 * cond(var) false, which would violate the guarantee.
 	 *
-	 * A consequence is that futex_wait() can return zero and absorb
-	 * a wakeup when *uaddr != val on entry to the syscall.  This is
-	 * rare, but normal.
+	 * On the other hand, we insert q and release the hash-bucket only
+	 * after testing *uaddr.  This guarantees that futex_wait() will NOT
+	 * absorb a wakeup if *uaddr does not match the desired values
+	 * while the syscall executes.
 	 */
 retry:
 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
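The rewritten comment tightens the claim: the old text treated an absorbed wakeup as "rare, but normal", while the new one explains why the ordering (test *uaddr under the bucket lock, queue before unlocking) rules it out. A small sketch of that ordering, with a pthread mutex standing in for the hash-bucket lock (names and error values are illustrative):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned int uval_shadow = 42;	/* stands in for *uaddr */
	static int queued;

	/* futex_wait_setup()-style ordering, illustrative only. */
	static int wait_setup(unsigned int val)
	{
		pthread_mutex_lock(&bucket_lock);
		if (uval_shadow != val) {	/* test under the lock ... */
			pthread_mutex_unlock(&bucket_lock);
			return -11;		/* -EWOULDBLOCK */
		}
		queued = 1;			/* ... queue before unlocking */
		pthread_mutex_unlock(&bucket_lock);
		return 0;
	}

	int main(void)
	{
		printf("wait_setup(42) = %d, queued=%d\n", wait_setup(42), queued);
		printf("wait_setup(7)  = %d, queued=%d\n", wait_setup(7), queued);
		return 0;
	}

Any waker must take the same bucket lock to deliver a wakeup, so no wakeup can slip between the value test and the queueing.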
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	u32 uval;
 	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
+	u32 uval, vpid = task_pid_vnr(current);
 	int ret;
 
 retry:
@@ -2057,7 +2057,7 @@ retry:
 	/*
 	 * We release only a lock we actually own:
 	 */
-	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+	if ((uval & FUTEX_TID_MASK) != vpid)
 		return -EPERM;
 
 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
 	 * again. If it succeeds then we can return without waking
 	 * anyone else up:
 	 */
-	if (!(uval & FUTEX_OWNER_DIED))
-		uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
-
-
-	if (unlikely(uval == -EFAULT))
+	if (!(uval & FUTEX_OWNER_DIED) &&
+	    cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
 		goto pi_faulted;
 	/*
 	 * Rare case: we managed to release the lock atomically,
 	 * no need to wake anyone else up:
 	 */
-	if (unlikely(uval == task_pid_vnr(current)))
+	if (unlikely(uval == vpid))
 		goto out_unlock;
 
 	/*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 	 * We were woken prior to requeue by a timeout or a signal.
 	 * Unqueue the futex_q and determine which it was.
 	 */
-	plist_del(&q->list, &q->list.plist);
+	plist_del(&q->list, &hb->chain);
 
 	/* Handle spurious wakeups gracefully */
 	ret = -EWOULDBLOCK;
@@ -2463,11 +2460,20 @@ retry:
 	 * userspace.
 	 */
 	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
-	nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
-	if (nval == -EFAULT)
-		return -1;
-
+	/*
+	 * We are not holding a lock here, but we want to have
+	 * the pagefault_disable/enable() protection because
+	 * we want to handle the fault gracefully. If the
+	 * access fails we try to fault in the futex with R/W
+	 * verification via get_user_pages. get_user() above
+	 * does not guarantee R/W access. If that fails we
+	 * give up and leave the futex locked.
+	 */
+	if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+		if (fault_in_user_writeable(uaddr))
+			return -1;
+		goto retry;
+	}
 	if (nval != uval)
 		goto retry;
 
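The new block replaces a hard failure with fault_in_user_writeable() plus a retry, for the reasons the added comment spells out: the earlier get_user() only proves the page is readable. A simulated version of that control flow, where every helper is a mock and the first attempt deliberately faults:

	#include <stdio.h>

	static int attempts;

	/* Mock atomic update: simulates -EFAULT on the first attempt. */
	static int cmpxchg_sim(unsigned int *nval, unsigned int *uaddr,
			       unsigned int uval, unsigned int mval)
	{
		if (attempts++ == 0)
			return -14;	/* simulated -EFAULT */
		*nval = *uaddr;
		if (*uaddr == uval)
			*uaddr = mval;
		return 0;
	}

	/* Mock fault-in: pretend get_user_pages() made the page writable. */
	static int fault_in_sim(unsigned int *uaddr)
	{
		(void)uaddr;
		return 0;
	}

	int main(void)
	{
		unsigned int futex = 0x1001, nval;
		unsigned int uval = 0x1001;
		unsigned int mval = uval | 0x40000000;	/* FUTEX_OWNER_DIED */

	retry:
		if (cmpxchg_sim(&nval, &futex, uval, mval)) {
			if (fault_in_sim(&futex))
				return 1;	/* give up, futex stays locked */
			goto retry;
		}
		printf("futex now %#x after %d attempts\n", futex, attempts);
		return 0;
	}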
@@ -2678,8 +2684,7 @@ static int __init futex_init(void)
 	 * implementation, the non-functional ones will return
 	 * -ENOSYS.
 	 */
-	curval = cmpxchg_futex_value_locked(NULL, 0, 0);
-	if (curval == -EFAULT)
+	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
 		futex_cmpxchg_enabled = 1;
 
 	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
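This final probe deserves a note: passing NULL is deliberate. A functional arch implementation attempts the user access and faults (-EFAULT), while an arch without a futex cmpxchg returns -ENOSYS from its stub without touching memory, so the fault doubles as a capability test. A sketch of the two outcomes with mock implementations:

	#include <stdio.h>

	#define EFAULT	14
	#define ENOSYS	38

	/* Mock of a functional arch helper: touches *uaddr, so NULL faults. */
	static int cmpxchg_arch(unsigned int *curval, unsigned int *uaddr,
				unsigned int uval, unsigned int newval)
	{
		if (!uaddr)
			return -EFAULT;
		*curval = *uaddr;
		if (*uaddr == uval)
			*uaddr = newval;
		return 0;
	}

	/* Mock of an arch without the primitive: the stub never touches memory. */
	static int cmpxchg_stub(unsigned int *curval, unsigned int *uaddr,
				unsigned int uval, unsigned int newval)
	{
		(void)curval; (void)uaddr; (void)uval; (void)newval;
		return -ENOSYS;
	}

	int main(void)
	{
		unsigned int curval;

		printf("real arch:    enabled = %d\n",
		       cmpxchg_arch(&curval, NULL, 0, 0) == -EFAULT);
		printf("stubbed arch: enabled = %d\n",
		       cmpxchg_stub(&curval, NULL, 0, 0) == -EFAULT);
		return 0;
	}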