diff options
Diffstat (limited to 'kernel/futex.c')
| -rw-r--r-- | kernel/futex.c | 243 |
1 files changed, 188 insertions, 55 deletions
diff --git a/kernel/futex.c b/kernel/futex.c index 5f589279e462..b632b5f3f094 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -267,7 +267,7 @@ static inline void futex_get_mm(union futex_key *key) | |||
| 267 | * get_futex_key() implies a full barrier. This is relied upon | 267 | * get_futex_key() implies a full barrier. This is relied upon |
| 268 | * as full barrier (B), see the ordering comment above. | 268 | * as full barrier (B), see the ordering comment above. |
| 269 | */ | 269 | */ |
| 270 | smp_mb__after_atomic_inc(); | 270 | smp_mb__after_atomic(); |
| 271 | } | 271 | } |
| 272 | 272 | ||
| 273 | /* | 273 | /* |
| @@ -280,7 +280,7 @@ static inline void hb_waiters_inc(struct futex_hash_bucket *hb) | |||
| 280 | /* | 280 | /* |
| 281 | * Full barrier (A), see the ordering comment above. | 281 | * Full barrier (A), see the ordering comment above. |
| 282 | */ | 282 | */ |
| 283 | smp_mb__after_atomic_inc(); | 283 | smp_mb__after_atomic(); |
| 284 | #endif | 284 | #endif |
| 285 | } | 285 | } |
| 286 | 286 | ||
| @@ -743,6 +743,55 @@ void exit_pi_state_list(struct task_struct *curr) | |||
| 743 | raw_spin_unlock_irq(&curr->pi_lock); | 743 | raw_spin_unlock_irq(&curr->pi_lock); |
| 744 | } | 744 | } |
| 745 | 745 | ||
| 746 | /* | ||
| 747 | * We need to check the following states: | ||
| 748 | * | ||
| 749 | * Waiter | pi_state | pi->owner | uTID | uODIED | ? | ||
| 750 | * | ||
| 751 | * [1] NULL | --- | --- | 0 | 0/1 | Valid | ||
| 752 | * [2] NULL | --- | --- | >0 | 0/1 | Valid | ||
| 753 | * | ||
| 754 | * [3] Found | NULL | -- | Any | 0/1 | Invalid | ||
| 755 | * | ||
| 756 | * [4] Found | Found | NULL | 0 | 1 | Valid | ||
| 757 | * [5] Found | Found | NULL | >0 | 1 | Invalid | ||
| 758 | * | ||
| 759 | * [6] Found | Found | task | 0 | 1 | Valid | ||
| 760 | * | ||
| 761 | * [7] Found | Found | NULL | Any | 0 | Invalid | ||
| 762 | * | ||
| 763 | * [8] Found | Found | task | ==taskTID | 0/1 | Valid | ||
| 764 | * [9] Found | Found | task | 0 | 0 | Invalid | ||
| 765 | * [10] Found | Found | task | !=taskTID | 0/1 | Invalid | ||
| 766 | * | ||
| 767 | * [1] Indicates that the kernel can acquire the futex atomically. We | ||
| 768 | * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit. | ||
| 769 | * | ||
| 770 | * [2] Valid, if TID does not belong to a kernel thread. If no matching | ||
| 771 | * thread is found then it indicates that the owner TID has died. | ||
| 772 | * | ||
| 773 | * [3] Invalid. The waiter is queued on a non PI futex | ||
| 774 | * | ||
| 775 | * [4] Valid state after exit_robust_list(), which sets the user space | ||
| 776 | * value to FUTEX_WAITERS | FUTEX_OWNER_DIED. | ||
| 777 | * | ||
| 778 | * [5] The user space value got manipulated between exit_robust_list() | ||
| 779 | * and exit_pi_state_list() | ||
| 780 | * | ||
| 781 | * [6] Valid state after exit_pi_state_list() which sets the new owner in | ||
| 782 | * the pi_state but cannot access the user space value. | ||
| 783 | * | ||
| 784 | * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set. | ||
| 785 | * | ||
| 786 | * [8] Owner and user space value match | ||
| 787 | * | ||
| 788 | * [9] There is no transient state which sets the user space TID to 0 | ||
| 789 | * except exit_robust_list(), but this is indicated by the | ||
| 790 | * FUTEX_OWNER_DIED bit. See [4] | ||
| 791 | * | ||
| 792 | * [10] There is no transient state which leaves owner and user space | ||
| 793 | * TID out of sync. | ||
| 794 | */ | ||
| 746 | static int | 795 | static int |
| 747 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | 796 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, |
| 748 | union futex_key *key, struct futex_pi_state **ps) | 797 | union futex_key *key, struct futex_pi_state **ps) |
| @@ -755,12 +804,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 755 | plist_for_each_entry_safe(this, next, &hb->chain, list) { | 804 | plist_for_each_entry_safe(this, next, &hb->chain, list) { |
| 756 | if (match_futex(&this->key, key)) { | 805 | if (match_futex(&this->key, key)) { |
| 757 | /* | 806 | /* |
| 758 | * Another waiter already exists - bump up | 807 | * Sanity check the waiter before increasing |
| 759 | * the refcount and return its pi_state: | 808 | * the refcount and attaching to it. |
| 760 | */ | 809 | */ |
| 761 | pi_state = this->pi_state; | 810 | pi_state = this->pi_state; |
| 762 | /* | 811 | /* |
| 763 | * Userspace might have messed up non-PI and PI futexes | 812 | * Userspace might have messed up non-PI and |
| 813 | * PI futexes [3] | ||
| 764 | */ | 814 | */ |
| 765 | if (unlikely(!pi_state)) | 815 | if (unlikely(!pi_state)) |
| 766 | return -EINVAL; | 816 | return -EINVAL; |
| @@ -768,34 +818,70 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 768 | WARN_ON(!atomic_read(&pi_state->refcount)); | 818 | WARN_ON(!atomic_read(&pi_state->refcount)); |
| 769 | 819 | ||
| 770 | /* | 820 | /* |
| 771 | * When pi_state->owner is NULL then the owner died | 821 | * Handle the owner died case: |
| 772 | * and another waiter is on the fly. pi_state->owner | ||
| 773 | * is fixed up by the task which acquires | ||
| 774 | * pi_state->rt_mutex. | ||
| 775 | * | ||
| 776 | * We do not check for pid == 0 which can happen when | ||
| 777 | * the owner died and robust_list_exit() cleared the | ||
| 778 | * TID. | ||
| 779 | */ | 822 | */ |
| 780 | if (pid && pi_state->owner) { | 823 | if (uval & FUTEX_OWNER_DIED) { |
| 824 | /* | ||
| 825 | * exit_pi_state_list sets owner to NULL and | ||
| 826 | * wakes the topmost waiter. The task which | ||
| 827 | * acquires the pi_state->rt_mutex will fixup | ||
| 828 | * owner. | ||
| 829 | */ | ||
| 830 | if (!pi_state->owner) { | ||
| 831 | /* | ||
| 832 | * No pi state owner, but the user | ||
| 833 | * space TID is not 0. Inconsistent | ||
| 834 | * state. [5] | ||
| 835 | */ | ||
| 836 | if (pid) | ||
| 837 | return -EINVAL; | ||
| 838 | /* | ||
| 839 | * Take a ref on the state and | ||
| 840 | * return. [4] | ||
| 841 | */ | ||
| 842 | goto out_state; | ||
| 843 | } | ||
| 844 | |||
| 781 | /* | 845 | /* |
| 782 | * Bail out if user space manipulated the | 846 | * If TID is 0, then either the dying owner |
| 783 | * futex value. | 847 | * has not yet executed exit_pi_state_list() |
| 848 | * or some waiter acquired the rtmutex in the | ||
| 849 | * pi state, but did not yet fixup the TID in | ||
| 850 | * user space. | ||
| 851 | * | ||
| 852 | * Take a ref on the state and return. [6] | ||
| 784 | */ | 853 | */ |
| 785 | if (pid != task_pid_vnr(pi_state->owner)) | 854 | if (!pid) |
| 855 | goto out_state; | ||
| 856 | } else { | ||
| 857 | /* | ||
| 858 | * If the owner died bit is not set, | ||
| 859 | * then the pi_state must have an | ||
| 860 | * owner. [7] | ||
| 861 | */ | ||
| 862 | if (!pi_state->owner) | ||
| 786 | return -EINVAL; | 863 | return -EINVAL; |
| 787 | } | 864 | } |
| 788 | 865 | ||
| 866 | /* | ||
| 867 | * Bail out if user space manipulated the | ||
| 868 | * futex value. If pi state exists then the | ||
| 869 | * owner TID must be the same as the user | ||
| 870 | * space TID. [9/10] | ||
| 871 | */ | ||
| 872 | if (pid != task_pid_vnr(pi_state->owner)) | ||
| 873 | return -EINVAL; | ||
| 874 | |||
| 875 | out_state: | ||
| 789 | atomic_inc(&pi_state->refcount); | 876 | atomic_inc(&pi_state->refcount); |
| 790 | *ps = pi_state; | 877 | *ps = pi_state; |
| 791 | |||
| 792 | return 0; | 878 | return 0; |
| 793 | } | 879 | } |
| 794 | } | 880 | } |
| 795 | 881 | ||
| 796 | /* | 882 | /* |
| 797 | * We are the first waiter - try to look up the real owner and attach | 883 | * We are the first waiter - try to look up the real owner and attach |
| 798 | * the new pi_state to it, but bail out when TID = 0 | 884 | * the new pi_state to it, but bail out when TID = 0 [1] |
| 799 | */ | 885 | */ |
| 800 | if (!pid) | 886 | if (!pid) |
| 801 | return -ESRCH; | 887 | return -ESRCH; |
| @@ -803,6 +889,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 803 | if (!p) | 889 | if (!p) |
| 804 | return -ESRCH; | 890 | return -ESRCH; |
| 805 | 891 | ||
| 892 | if (!p->mm) { | ||
| 893 | put_task_struct(p); | ||
| 894 | return -EPERM; | ||
| 895 | } | ||
| 896 | |||
| 806 | /* | 897 | /* |
| 807 | * We need to look at the task state flags to figure out, | 898 | * We need to look at the task state flags to figure out, |
| 808 | * whether the task is exiting. To protect against the do_exit | 899 | * whether the task is exiting. To protect against the do_exit |
| @@ -823,6 +914,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, | |||
| 823 | return ret; | 914 | return ret; |
| 824 | } | 915 | } |
| 825 | 916 | ||
| 917 | /* | ||
| 918 | * No existing pi state. First waiter. [2] | ||
| 919 | */ | ||
| 826 | pi_state = alloc_pi_state(); | 920 | pi_state = alloc_pi_state(); |
| 827 | 921 | ||
| 828 | /* | 922 | /* |
| @@ -894,10 +988,18 @@ retry: | |||
| 894 | return -EDEADLK; | 988 | return -EDEADLK; |
| 895 | 989 | ||
| 896 | /* | 990 | /* |
| 897 | * Surprise - we got the lock. Just return to userspace: | 991 | * Surprise - we got the lock, but we do not trust user space at all. |
| 898 | */ | 992 | */ |
| 899 | if (unlikely(!curval)) | 993 | if (unlikely(!curval)) { |
| 900 | return 1; | 994 | /* |
| 995 | * We verify whether there is kernel state for this | ||
| 996 | * futex. If not, we can safely assume, that the 0 -> | ||
| 997 | * TID transition is correct. If state exists, we do | ||
| 998 | * not bother to fixup the user space state as it was | ||
| 999 | * corrupted already. | ||
| 1000 | */ | ||
| 1001 | return futex_top_waiter(hb, key) ? -EINVAL : 1; | ||
| 1002 | } | ||
| 901 | 1003 | ||
| 902 | uval = curval; | 1004 | uval = curval; |
| 903 | 1005 | ||
| @@ -1028,6 +1130,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
| 1028 | struct task_struct *new_owner; | 1130 | struct task_struct *new_owner; |
| 1029 | struct futex_pi_state *pi_state = this->pi_state; | 1131 | struct futex_pi_state *pi_state = this->pi_state; |
| 1030 | u32 uninitialized_var(curval), newval; | 1132 | u32 uninitialized_var(curval), newval; |
| 1133 | int ret = 0; | ||
| 1031 | 1134 | ||
| 1032 | if (!pi_state) | 1135 | if (!pi_state) |
| 1033 | return -EINVAL; | 1136 | return -EINVAL; |
| @@ -1051,23 +1154,19 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
| 1051 | new_owner = this->task; | 1154 | new_owner = this->task; |
| 1052 | 1155 | ||
| 1053 | /* | 1156 | /* |
| 1054 | * We pass it to the next owner. (The WAITERS bit is always | 1157 | * We pass it to the next owner. The WAITERS bit is always |
| 1055 | * kept enabled while there is PI state around. We must also | 1158 | * kept enabled while there is PI state around. We cleanup the |
| 1056 | * preserve the owner died bit.) | 1159 | * owner died bit, because we are the owner. |
| 1057 | */ | 1160 | */ |
| 1058 | if (!(uval & FUTEX_OWNER_DIED)) { | 1161 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); |
| 1059 | int ret = 0; | ||
| 1060 | 1162 | ||
| 1061 | newval = FUTEX_WAITERS | task_pid_vnr(new_owner); | 1163 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) |
| 1062 | 1164 | ret = -EFAULT; | |
| 1063 | if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) | 1165 | else if (curval != uval) |
| 1064 | ret = -EFAULT; | 1166 | ret = -EINVAL; |
| 1065 | else if (curval != uval) | 1167 | if (ret) { |
| 1066 | ret = -EINVAL; | 1168 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); |
| 1067 | if (ret) { | 1169 | return ret; |
| 1068 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); | ||
| 1069 | return ret; | ||
| 1070 | } | ||
| 1071 | } | 1170 | } |
| 1072 | 1171 | ||
| 1073 | raw_spin_lock_irq(&pi_state->owner->pi_lock); | 1172 | raw_spin_lock_irq(&pi_state->owner->pi_lock); |
| @@ -1347,7 +1446,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, | |||
| 1347 | * | 1446 | * |
| 1348 | * Return: | 1447 | * Return: |
| 1349 | * 0 - failed to acquire the lock atomically; | 1448 | * 0 - failed to acquire the lock atomically; |
| 1350 | * 1 - acquired the lock; | 1449 | * >0 - acquired the lock, return value is vpid of the top_waiter |
| 1351 | * <0 - error | 1450 | * <0 - error |
| 1352 | */ | 1451 | */ |
| 1353 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, | 1452 | static int futex_proxy_trylock_atomic(u32 __user *pifutex, |
| @@ -1358,7 +1457,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1358 | { | 1457 | { |
| 1359 | struct futex_q *top_waiter = NULL; | 1458 | struct futex_q *top_waiter = NULL; |
| 1360 | u32 curval; | 1459 | u32 curval; |
| 1361 | int ret; | 1460 | int ret, vpid; |
| 1362 | 1461 | ||
| 1363 | if (get_futex_value_locked(&curval, pifutex)) | 1462 | if (get_futex_value_locked(&curval, pifutex)) |
| 1364 | return -EFAULT; | 1463 | return -EFAULT; |
| @@ -1386,11 +1485,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, | |||
| 1386 | * the contended case or if set_waiters is 1. The pi_state is returned | 1485 | * the contended case or if set_waiters is 1. The pi_state is returned |
| 1387 | * in ps in contended cases. | 1486 | * in ps in contended cases. |
| 1388 | */ | 1487 | */ |
| 1488 | vpid = task_pid_vnr(top_waiter->task); | ||
| 1389 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, | 1489 | ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, |
| 1390 | set_waiters); | 1490 | set_waiters); |
| 1391 | if (ret == 1) | 1491 | if (ret == 1) { |
| 1392 | requeue_pi_wake_futex(top_waiter, key2, hb2); | 1492 | requeue_pi_wake_futex(top_waiter, key2, hb2); |
| 1393 | 1493 | return vpid; | |
| 1494 | } | ||
| 1394 | return ret; | 1495 | return ret; |
| 1395 | } | 1496 | } |
| 1396 | 1497 | ||
| @@ -1421,10 +1522,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, | |||
| 1421 | struct futex_pi_state *pi_state = NULL; | 1522 | struct futex_pi_state *pi_state = NULL; |
| 1422 | struct futex_hash_bucket *hb1, *hb2; | 1523 | struct futex_hash_bucket *hb1, *hb2; |
| 1423 | struct futex_q *this, *next; | 1524 | struct futex_q *this, *next; |
| 1424 | u32 curval2; | ||
| 1425 | 1525 | ||
| 1426 | if (requeue_pi) { | 1526 | if (requeue_pi) { |
| 1427 | /* | 1527 | /* |
| 1528 | * Requeue PI only works on two distinct uaddrs. This | ||
| 1529 | * check is only valid for private futexes. See below. | ||
| 1530 | */ | ||
| 1531 | if (uaddr1 == uaddr2) | ||
| 1532 | return -EINVAL; | ||
| 1533 | |||
| 1534 | /* | ||
| 1428 | * requeue_pi requires a pi_state, try to allocate it now | 1535 | * requeue_pi requires a pi_state, try to allocate it now |
| 1429 | * without any locks in case it fails. | 1536 | * without any locks in case it fails. |
| 1430 | */ | 1537 | */ |
| @@ -1462,6 +1569,15 @@ retry: | |||
| 1462 | if (unlikely(ret != 0)) | 1569 | if (unlikely(ret != 0)) |
| 1463 | goto out_put_key1; | 1570 | goto out_put_key1; |
| 1464 | 1571 | ||
| 1572 | /* | ||
| 1573 | * The check above which compares uaddrs is not sufficient for | ||
| 1574 | * shared futexes. We need to compare the keys: | ||
| 1575 | */ | ||
| 1576 | if (requeue_pi && match_futex(&key1, &key2)) { | ||
| 1577 | ret = -EINVAL; | ||
| 1578 | goto out_put_keys; | ||
| 1579 | } | ||
| 1580 | |||
| 1465 | hb1 = hash_futex(&key1); | 1581 | hb1 = hash_futex(&key1); |
| 1466 | hb2 = hash_futex(&key2); | 1582 | hb2 = hash_futex(&key2); |
| 1467 | 1583 | ||
| @@ -1509,16 +1625,25 @@ retry_private: | |||
| 1509 | * At this point the top_waiter has either taken uaddr2 or is | 1625 | * At this point the top_waiter has either taken uaddr2 or is |
| 1510 | * waiting on it. If the former, then the pi_state will not | 1626 | * waiting on it. If the former, then the pi_state will not |
| 1511 | * exist yet, look it up one more time to ensure we have a | 1627 | * exist yet, look it up one more time to ensure we have a |
| 1512 | * reference to it. | 1628 | * reference to it. If the lock was taken, ret contains the |
| 1629 | * vpid of the top waiter task. | ||
| 1513 | */ | 1630 | */ |
| 1514 | if (ret == 1) { | 1631 | if (ret > 0) { |
| 1515 | WARN_ON(pi_state); | 1632 | WARN_ON(pi_state); |
| 1516 | drop_count++; | 1633 | drop_count++; |
| 1517 | task_count++; | 1634 | task_count++; |
| 1518 | ret = get_futex_value_locked(&curval2, uaddr2); | 1635 | /* |
| 1519 | if (!ret) | 1636 | * If we acquired the lock, then the user |
| 1520 | ret = lookup_pi_state(curval2, hb2, &key2, | 1637 | * space value of uaddr2 should be vpid. It |
| 1521 | &pi_state); | 1638 | * cannot be changed by the top waiter as it |
| 1639 | * is blocked on hb2 lock if it tries to do | ||
| 1640 | * so. If something fiddled with it behind our | ||
| 1641 | * back the pi state lookup might unearth | ||
| 1642 | * it. So we rather use the known value than | ||
| 1643 | * rereading and handing potential crap to | ||
| 1644 | * lookup_pi_state. | ||
| 1645 | */ | ||
| 1646 | ret = lookup_pi_state(ret, hb2, &key2, &pi_state); | ||
| 1522 | } | 1647 | } |
| 1523 | 1648 | ||
| 1524 | switch (ret) { | 1649 | switch (ret) { |
| @@ -2301,9 +2426,10 @@ retry: | |||
| 2301 | /* | 2426 | /* |
| 2302 | * To avoid races, try to do the TID -> 0 atomic transition | 2427 | * To avoid races, try to do the TID -> 0 atomic transition |
| 2303 | * again. If it succeeds then we can return without waking | 2428 | * again. If it succeeds then we can return without waking |
| 2304 | * anyone else up: | 2429 | * anyone else up. We only try this if neither the waiters nor |
| 2430 | * the owner died bit are set. | ||
| 2305 | */ | 2431 | */ |
| 2306 | if (!(uval & FUTEX_OWNER_DIED) && | 2432 | if (!(uval & ~FUTEX_TID_MASK) && |
| 2307 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) | 2433 | cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0)) |
| 2308 | goto pi_faulted; | 2434 | goto pi_faulted; |
| 2309 | /* | 2435 | /* |
| @@ -2333,11 +2459,9 @@ retry: | |||
| 2333 | /* | 2459 | /* |
| 2334 | * No waiters - kernel unlocks the futex: | 2460 | * No waiters - kernel unlocks the futex: |
| 2335 | */ | 2461 | */ |
| 2336 | if (!(uval & FUTEX_OWNER_DIED)) { | 2462 | ret = unlock_futex_pi(uaddr, uval); |
| 2337 | ret = unlock_futex_pi(uaddr, uval); | 2463 | if (ret == -EFAULT) |
| 2338 | if (ret == -EFAULT) | 2464 | goto pi_faulted; |
| 2339 | goto pi_faulted; | ||
| 2340 | } | ||
| 2341 | 2465 | ||
| 2342 | out_unlock: | 2466 | out_unlock: |
| 2343 | spin_unlock(&hb->lock); | 2467 | spin_unlock(&hb->lock); |
| @@ -2499,6 +2623,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
| 2499 | if (ret) | 2623 | if (ret) |
| 2500 | goto out_key2; | 2624 | goto out_key2; |
| 2501 | 2625 | ||
| 2626 | /* | ||
| 2627 | * The check above which compares uaddrs is not sufficient for | ||
| 2628 | * shared futexes. We need to compare the keys: | ||
| 2629 | */ | ||
| 2630 | if (match_futex(&q.key, &key2)) { | ||
| 2631 | ret = -EINVAL; | ||
| 2632 | goto out_put_keys; | ||
| 2633 | } | ||
| 2634 | |||
| 2502 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ | 2635 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ |
| 2503 | futex_wait_queue_me(hb, &q, to); | 2636 | futex_wait_queue_me(hb, &q, to); |
| 2504 | 2637 | ||
