path: root/kernel/futex.c
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--  kernel/futex.c  243
1 file changed, 188 insertions(+), 55 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 5f589279e462..b632b5f3f094 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key)
          * get_futex_key() implies a full barrier. This is relied upon
          * as full barrier (B), see the ordering comment above.
          */
-        smp_mb__after_atomic_inc();
+        smp_mb__after_atomic();
 }
 
 /*
@@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
         /*
          * Full barrier (A), see the ordering comment above.
          */
-        smp_mb__after_atomic_inc();
+        smp_mb__after_atomic();
 #endif
 }
 
@@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr)
         raw_spin_unlock_irq(&curr->pi_lock);
 }
 
+/*
+ * We need to check the following states:
+ *
+ *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
+ *
+ * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
+ * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
+ *
+ * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
+ *
+ * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
+ * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
+ *
+ * [6]  Found  | Found    | task      | 0         | 1      | Valid
+ *
+ * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
+ *
+ * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
+ * [9]  Found  | Found    | task      | 0         | 0      | Invalid
+ * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
+ *
+ * [1]  Indicates that the kernel can acquire the futex atomically. We
+ *      came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
+ *
+ * [2]  Valid, if TID does not belong to a kernel thread. If no matching
+ *      thread is found then it indicates that the owner TID has died.
+ *
+ * [3]  Invalid. The waiter is queued on a non PI futex
+ *
+ * [4]  Valid state after exit_robust_list(), which sets the user space
+ *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
+ *
+ * [5]  The user space value got manipulated between exit_robust_list()
+ *      and exit_pi_state_list()
+ *
+ * [6]  Valid state after exit_pi_state_list() which sets the new owner in
+ *      the pi_state but cannot access the user space value.
+ *
+ * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
+ *
+ * [8]  Owner and user space value match
+ *
+ * [9]  There is no transient state which sets the user space TID to 0
+ *      except exit_robust_list(), but this is indicated by the
+ *      FUTEX_OWNER_DIED bit. See [4]
+ *
+ * [10] There is no transient state which leaves owner and user space
+ *      TID out of sync.
+ */
 static int
 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                 union futex_key *key, struct futex_pi_state **ps)
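Note (illustration, not part of the patch): the uTID and uODIED columns in the state table above come straight out of the 32-bit futex word in user space. A minimal user-space sketch of that decoding, using the FUTEX_* bit masks from the UAPI header:

    #include <stdint.h>
    #include <stdio.h>
    #include <linux/futex.h>   /* FUTEX_TID_MASK, FUTEX_OWNER_DIED, FUTEX_WAITERS */

    /* Print the fields the state table above is built from. */
    void decode_futex_word(uint32_t uval)
    {
            uint32_t utid = uval & FUTEX_TID_MASK;        /* uTID column   */
            int uodied    = !!(uval & FUTEX_OWNER_DIED);  /* uODIED column */
            int waiters   = !!(uval & FUTEX_WAITERS);     /* kernel state expected */

            printf("uTID=%u uODIED=%d WAITERS=%d\n", utid, uodied, waiters);
    }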
@@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
         plist_for_each_entry_safe(this, next, &hb->chain, list) {
                 if (match_futex(&this->key, key)) {
                         /*
-                         * Another waiter already exists - bump up
-                         * the refcount and return its pi_state:
+                         * Sanity check the waiter before increasing
+                         * the refcount and attaching to it.
                          */
                         pi_state = this->pi_state;
                         /*
-                         * Userspace might have messed up non-PI and PI futexes
+                         * Userspace might have messed up non-PI and
+                         * PI futexes [3]
                          */
                         if (unlikely(!pi_state))
                                 return -EINVAL;
@@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                         WARN_ON(!atomic_read(&pi_state->refcount));
 
                         /*
-                         * When pi_state->owner is NULL then the owner died
-                         * and another waiter is on the fly. pi_state->owner
-                         * is fixed up by the task which acquires
-                         * pi_state->rt_mutex.
-                         *
-                         * We do not check for pid == 0 which can happen when
-                         * the owner died and robust_list_exit() cleared the
-                         * TID.
+                         * Handle the owner died case:
                          */
-                        if (pid && pi_state->owner) {
+                        if (uval & FUTEX_OWNER_DIED) {
+                                /*
+                                 * exit_pi_state_list sets owner to NULL and
+                                 * wakes the topmost waiter. The task which
+                                 * acquires the pi_state->rt_mutex will fixup
+                                 * owner.
+                                 */
+                                if (!pi_state->owner) {
+                                        /*
+                                         * No pi state owner, but the user
+                                         * space TID is not 0. Inconsistent
+                                         * state. [5]
+                                         */
+                                        if (pid)
+                                                return -EINVAL;
+                                        /*
+                                         * Take a ref on the state and
+                                         * return. [4]
+                                         */
+                                        goto out_state;
+                                }
+
                                 /*
-                                 * Bail out if user space manipulated the
-                                 * futex value.
+                                 * If TID is 0, then either the dying owner
+                                 * has not yet executed exit_pi_state_list()
+                                 * or some waiter acquired the rtmutex in the
+                                 * pi state, but did not yet fixup the TID in
+                                 * user space.
+                                 *
+                                 * Take a ref on the state and return. [6]
                                  */
-                                if (pid != task_pid_vnr(pi_state->owner))
+                                if (!pid)
+                                        goto out_state;
+                        } else {
+                                /*
+                                 * If the owner died bit is not set,
+                                 * then the pi_state must have an
+                                 * owner. [7]
+                                 */
+                                if (!pi_state->owner)
                                         return -EINVAL;
                         }
 
+                        /*
+                         * Bail out if user space manipulated the
+                         * futex value. If pi state exists then the
+                         * owner TID must be the same as the user
+                         * space TID. [9/10]
+                         */
+                        if (pid != task_pid_vnr(pi_state->owner))
+                                return -EINVAL;
+
+                out_state:
                         atomic_inc(&pi_state->refcount);
                         *ps = pi_state;
-
                         return 0;
                 }
         }
 
         /*
          * We are the first waiter - try to look up the real owner and attach
-         * the new pi_state to it, but bail out when TID = 0
+         * the new pi_state to it, but bail out when TID = 0 [1]
          */
         if (!pid)
                 return -ESRCH;
@@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
         if (!p)
                 return -ESRCH;
 
+        if (!p->mm) {
+                put_task_struct(p);
+                return -EPERM;
+        }
+
         /*
          * We need to look at the task state flags to figure out,
          * whether the task is exiting. To protect against the do_exit
@@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
                 return ret;
         }
 
+        /*
+         * No existing pi state. First waiter. [2]
+         */
         pi_state = alloc_pi_state();
 
         /*
@@ -894,10 +988,18 @@ retry:
                 return -EDEADLK;
 
         /*
-         * Surprise - we got the lock. Just return to userspace:
+         * Surprise - we got the lock, but we do not trust user space at all.
          */
-        if (unlikely(!curval))
-                return 1;
+        if (unlikely(!curval)) {
+                /*
+                 * We verify whether there is kernel state for this
+                 * futex. If not, we can safely assume, that the 0 ->
+                 * TID transition is correct. If state exists, we do
+                 * not bother to fixup the user space state as it was
+                 * corrupted already.
+                 */
+                return futex_top_waiter(hb, key) ? -EINVAL : 1;
+        }
 
         uval = curval;
 
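Note (illustration, not part of the patch): futex_lock_pi_atomic() only sees curval == 0 because the uncontended acquisition happens entirely in user space; the kernel is entered on contention and cannot trust whatever it finds in the word. A minimal sketch of that user-space fast path, along the lines of Documentation/pi-futex.txt (error handling omitted):

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    void pi_mutex_lock(uint32_t *futex_word)
    {
            uint32_t tid = syscall(SYS_gettid);
            uint32_t expected = 0;

            /* Fast path: the 0 -> TID transition is done without a syscall. */
            if (__atomic_compare_exchange_n(futex_word, &expected, tid, 0,
                                            __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                    return;

            /* Contended: the kernel queues us and fixes up the futex word. */
            syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }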
@@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
         struct task_struct *new_owner;
         struct futex_pi_state *pi_state = this->pi_state;
         u32 uninitialized_var(curval), newval;
+        int ret = 0;
 
         if (!pi_state)
                 return -EINVAL;
@@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
         new_owner = this->task;
 
         /*
-         * We pass it to the next owner. (The WAITERS bit is always
-         * kept enabled while there is PI state around. We must also
-         * preserve the owner died bit.)
+         * We pass it to the next owner. The WAITERS bit is always
+         * kept enabled while there is PI state around. We cleanup the
+         * owner died bit, because we are the owner.
          */
-        if (!(uval & FUTEX_OWNER_DIED)) {
-                int ret = 0;
+        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
-                newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
-
-                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
-                        ret = -EFAULT;
-                else if (curval != uval)
-                        ret = -EINVAL;
-                if (ret) {
-                        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
-                        return ret;
-                }
+        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+                ret = -EFAULT;
+        else if (curval != uval)
+                ret = -EINVAL;
+        if (ret) {
+                raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+                return ret;
         }
 
         raw_spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  *
  * Return:
  *  0 - failed to acquire the lock atomically;
- *  1 - acquired the lock;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
  * <0 - error
  */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 {
         struct futex_q *top_waiter = NULL;
         u32 curval;
-        int ret;
+        int ret, vpid;
 
         if (get_futex_value_locked(&curval, pifutex))
                 return -EFAULT;
@@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
          * the contended case or if set_waiters is 1. The pi_state is returned
          * in ps in contended cases.
          */
+        vpid = task_pid_vnr(top_waiter->task);
         ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
                                    set_waiters);
-        if (ret == 1)
+        if (ret == 1) {
                 requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+                return vpid;
+        }
         return ret;
 }
 
@@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
         struct futex_pi_state *pi_state = NULL;
         struct futex_hash_bucket *hb1, *hb2;
         struct futex_q *this, *next;
-        u32 curval2;
 
         if (requeue_pi) {
                 /*
+                 * Requeue PI only works on two distinct uaddrs. This
+                 * check is only valid for private futexes. See below.
+                 */
+                if (uaddr1 == uaddr2)
+                        return -EINVAL;
+
+                /*
                  * requeue_pi requires a pi_state, try to allocate it now
                  * without any locks in case it fails.
                  */
@@ -1462,6 +1569,15 @@ retry:
         if (unlikely(ret != 0))
                 goto out_put_key1;
 
+        /*
+         * The check above which compares uaddrs is not sufficient for
+         * shared futexes. We need to compare the keys:
+         */
+        if (requeue_pi && match_futex(&key1, &key2)) {
+                ret = -EINVAL;
+                goto out_put_keys;
+        }
+
         hb1 = hash_futex(&key1);
         hb2 = hash_futex(&key2);
 
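Note (illustration, not part of the patch): the two new -EINVAL checks guard the requeue-PI path against being handed the same futex on both sides. A hedged sketch of the kind of caller they protect, a condvar-broadcast style FUTEX_CMP_REQUEUE_PI where uaddr1 (the non-PI wait futex) and uaddr2 (the PI futex) must be distinct:

    #include <limits.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    /* Wake at most one waiter on 'cond' and requeue the rest onto the PI futex 'mutex'. */
    long requeue_waiters_to_pi_mutex(uint32_t *cond, uint32_t *mutex,
                                     uint32_t expected_cond_val)
    {
            return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI,
                           1,                       /* nr_wake, must be 1 */
                           (void *)(long)INT_MAX,   /* nr_requeue         */
                           mutex, expected_cond_val);
    }

Passing cond == mutex (or, for shared futexes, two mappings resolving to the same key) now fails with -EINVAL instead of corrupting the PI state.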
@@ -1509,16 +1625,25 @@ retry_private:
                 * At this point the top_waiter has either taken uaddr2 or is
                 * waiting on it. If the former, then the pi_state will not
                 * exist yet, look it up one more time to ensure we have a
-                * reference to it.
+                * reference to it. If the lock was taken, ret contains the
+                * vpid of the top waiter task.
                 */
-               if (ret == 1) {
+               if (ret > 0) {
                        WARN_ON(pi_state);
                        drop_count++;
                        task_count++;
-                       ret = get_futex_value_locked(&curval2, uaddr2);
-                       if (!ret)
-                               ret = lookup_pi_state(curval2, hb2, &key2,
-                                                     &pi_state);
+                       /*
+                        * If we acquired the lock, then the user
+                        * space value of uaddr2 should be vpid. It
+                        * cannot be changed by the top waiter as it
+                        * is blocked on hb2 lock if it tries to do
+                        * so. If something fiddled with it behind our
+                        * back the pi state lookup might unearth
+                        * it. So we rather use the known value than
+                        * rereading and handing potential crap to
+                        * lookup_pi_state.
+                        */
+                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
                }
 
                switch (ret) {
@@ -2301,9 +2426,10 @@ retry:
         /*
          * To avoid races, try to do the TID -> 0 atomic transition
          * again. If it succeeds then we can return without waking
-         * anyone else up:
+         * anyone else up. We only try this if neither the waiters nor
+         * the owner died bit are set.
          */
-        if (!(uval & FUTEX_OWNER_DIED) &&
+        if (!(uval & ~FUTEX_TID_MASK) &&
             cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
                 goto pi_faulted;
         /*
@@ -2333,11 +2459,9 @@ retry:
         /*
          * No waiters - kernel unlocks the futex:
          */
-        if (!(uval & FUTEX_OWNER_DIED)) {
-                ret = unlock_futex_pi(uaddr, uval);
-                if (ret == -EFAULT)
-                        goto pi_faulted;
-        }
+        ret = unlock_futex_pi(uaddr, uval);
+        if (ret == -EFAULT)
+                goto pi_faulted;
 
 out_unlock:
         spin_unlock(&hb->lock);
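Note (illustration, not part of the patch): the kernel-side checks above mirror the user-space unlock protocol: the TID -> 0 transition may only be done in user space while no flag bits are set; once FUTEX_WAITERS or FUTEX_OWNER_DIED is set, FUTEX_UNLOCK_PI must hand the lock to the top waiter. A minimal sketch of that fast path:

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    void pi_mutex_unlock(uint32_t *futex_word)
    {
            /* Uncontended case: the word is exactly our TID, no flag bits set. */
            uint32_t expected = syscall(SYS_gettid);

            if (__atomic_compare_exchange_n(futex_word, &expected, 0, 0,
                                            __ATOMIC_RELEASE, __ATOMIC_RELAXED))
                    return;

            /* WAITERS and/or OWNER_DIED set: let the kernel wake the top waiter. */
            syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }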
@@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
         if (ret)
                 goto out_key2;
 
+        /*
+         * The check above which compares uaddrs is not sufficient for
+         * shared futexes. We need to compare the keys:
+         */
+        if (match_futex(&q.key, &key2)) {
+                ret = -EINVAL;
+                goto out_put_keys;
+        }
+
         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
         futex_wait_queue_me(hb, &q, to);
 