Diffstat (limited to 'kernel/futex.c')
 -rw-r--r--  kernel/futex.c  127
 1 file changed, 95 insertions(+), 32 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 44a1261cb9ff..5f589279e462 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -70,7 +70,10 @@
 #include "locking/rtmutex_common.h"
 
 /*
- * Basic futex operation and ordering guarantees:
+ * READ this before attempting to hack on futexes!
+ *
+ * Basic futex operation and ordering guarantees
+ * =============================================
  *
  * The waiter reads the futex value in user space and calls
  * futex_wait(). This function computes the hash bucket and acquires
@@ -119,7 +122,7 @@
  * sys_futex(WAIT, futex, val);
  *   futex_wait(futex, val);
  *
- *   waiters++;
+ *   waiters++; (a)
  *   mb(); (A) <-- paired with -.
  *                              |
  *   lock(hash_bucket(futex));  |
@@ -135,14 +138,14 @@
  *   unlock(hash_bucket(futex));
  *   schedule();                         if (waiters)
  *                                         lock(hash_bucket(futex));
- *                                         wake_waiters(futex);
- *                                         unlock(hash_bucket(futex));
+ *   else                                  wake_waiters(futex);
+ *     waiters--; (b)                      unlock(hash_bucket(futex));
  *
- * Where (A) orders the waiters increment and the futex value read -- this
- * is guaranteed by the head counter in the hb spinlock; and where (B)
- * orders the write to futex and the waiters read -- this is done by the
- * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
- * depending on the futex type.
+ * Where (A) orders the waiters increment and the futex value read through
+ * atomic operations (see hb_waiters_inc) and where (B) orders the write
+ * to futex and the waiters read -- this is done by the barriers in
+ * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
+ * futex type.
  *
  * This yields the following case (where X:=waiters, Y:=futex):
  *
@@ -155,9 +158,22 @@
  * Which guarantees that x==0 && y==0 is impossible; which translates back into
  * the guarantee that we cannot both miss the futex variable change and the
  * enqueue.
+ *
+ * Note that a new waiter is accounted for in (a) even when it is possible that
+ * the wait call can return error, in which case we backtrack from it in (b).
+ * Refer to the comment in queue_lock().
+ *
+ * Similarly, in order to account for waiters being requeued on another
+ * address we always increment the waiters for the destination bucket before
+ * acquiring the lock. It then decrements them again after releasing it -
+ * the code that actually moves the futex(es) between hash buckets (requeue_futex)
+ * will do the additional required waiter count housekeeping. This is done for
+ * double_lock_hb() and double_unlock_hb(), respectively.
  */
 
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
 int __read_mostly futex_cmpxchg_enabled;
+#endif
 
 /*
  * Futex flags used to encode options to functions and preserve them across
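
The ordering argument above is the classic store-buffering litmus test. A minimal user-space sketch of the same reasoning, using C11 atomics in place of the kernel's mb()/spinlock-implied barriers (all names here are illustrative, not kernel API):

#include <stdatomic.h>

static atomic_int waiters;    /* X: stands in for hb->waiters    */
static atomic_int futex_val;  /* Y: stands in for the futex word */

/* Waiter side: w[X]=1; MB (A); r[Y]. */
static int waiter_sees_change(int expected)
{
        atomic_fetch_add_explicit(&waiters, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* barrier (A) */
        return atomic_load_explicit(&futex_val, memory_order_relaxed) != expected;
}

/* Waker side: w[Y]=1; MB (B); r[X]. */
static int waker_sees_waiter(void)
{
        atomic_store_explicit(&futex_val, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* barrier (B) */
        return atomic_load_explicit(&waiters, memory_order_relaxed) != 0;
}

With both full fences in place, the outcome where waiter_sees_change() and waker_sees_waiter() both return 0 is the forbidden r[X]==r[Y]==0 case: either the waiter observes the changed futex value and does not sleep, or the waker observes a nonzero count and takes the bucket lock to perform the wakeup.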
@@ -234,6 +250,7 @@ static const struct futex_q futex_q_init = {
  * waiting on a futex.
  */
 struct futex_hash_bucket {
+	atomic_t waiters;
 	spinlock_t lock;
 	struct plist_head chain;
 } ____cacheline_aligned_in_smp;
@@ -253,22 +270,37 @@ static inline void futex_get_mm(union futex_key *key)
 	smp_mb__after_atomic_inc();
 }
 
-static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
 {
 #ifdef CONFIG_SMP
+	atomic_inc(&hb->waiters);
 	/*
-	 * Tasks trying to enter the critical region are most likely
-	 * potential waiters that will be added to the plist. Ensure
-	 * that wakers won't miss to-be-slept tasks in the window between
-	 * the wait call and the actual plist_add.
+	 * Full barrier (A), see the ordering comment above.
 	 */
-	if (spin_is_locked(&hb->lock))
-		return true;
-	smp_rmb(); /* Make sure we check the lock state first */
-
-	return !plist_head_empty(&hb->chain);
+	smp_mb__after_atomic_inc();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	return atomic_read(&hb->waiters);
 #else
-	return true;
+	return 1;
 #endif
 }
 
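
For context on why hb_waiters_pending() exists at all: it lets the wake side skip taking hb->lock when the bucket has no waiters, which is what the spin_is_locked()-based heuristic being replaced here also tried to approximate. A condensed sketch of the consuming wake path, modeled on futex_wake() in this file with the wakeup loop elided:

static int futex_wake_sketch(u32 __user *uaddr, unsigned int flags, int nr_wake)
{
        struct futex_hash_bucket *hb;
        union futex_key key = FUTEX_KEY_INIT;
        int ret;

        ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
        if (unlikely(ret != 0))
                return ret;

        hb = hash_futex(&key);

        /* Fast path: no waiters accounted, no lock taken. */
        if (!hb_waiters_pending(hb))
                goto out_put_key;

        spin_lock(&hb->lock);
        /* ... walk hb->chain and wake up to nr_wake matching waiters ... */
        spin_unlock(&hb->lock);
out_put_key:
        put_futex_key(&key);
        return ret;
}

The explicit atomic counter makes this test self-contained, where the old spin_is_locked()/plist_head_empty() heuristic depended on details of the lock implementation.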
@@ -954,6 +986,7 @@ static void __unqueue_futex(struct futex_q *q)
 
 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 }
 
 /*
@@ -1257,7 +1290,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	 */
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
+		hb_waiters_dec(hb1);
 		plist_add(&q->list, &hb2->chain);
+		hb_waiters_inc(hb2);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
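
A worked count trace helps here. Suppose one waiter is requeued from hb1 (counter w1) to hb2 (counter w2), and recall the pre-bias taken by futex_requeue() before double_lock_hb(), visible in the next hunk:

        hb_waiters_inc(hb2) before double_lock_hb():   w1,    w2+1   (pre-bias)
        hb_waiters_dec(hb1) in requeue_futex():        w1-1,  w2+1
        hb_waiters_inc(hb2) in requeue_futex():        w1-1,  w2+2
        hb_waiters_dec(hb2) after double_unlock_hb():  w1-1,  w2+1   (bias dropped)

Net effect: hb1 loses one waiter, hb2 gains one, and hb2's count never dips below w2+1 while the move is in flight, so a concurrent futex_wake() on the destination bucket cannot read zero and skip a wakeup.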
@@ -1431,6 +1466,7 @@ retry:
 	hb2 = hash_futex(&key2);
 
 retry_private:
+	hb_waiters_inc(hb2);
 	double_lock_hb(hb1, hb2);
 
 	if (likely(cmpval != NULL)) {
@@ -1440,6 +1476,7 @@ retry_private:
 
 		if (unlikely(ret)) {
 			double_unlock_hb(hb1, hb2);
+			hb_waiters_dec(hb2);
 
 			ret = get_user(curval, uaddr1);
 			if (ret)
@@ -1489,6 +1526,7 @@ retry_private:
 			break;
 		case -EFAULT:
 			double_unlock_hb(hb1, hb2);
+			hb_waiters_dec(hb2);
 			put_futex_key(&key2);
 			put_futex_key(&key1);
 			ret = fault_in_user_writeable(uaddr2);
@@ -1498,6 +1536,7 @@ retry_private:
 		case -EAGAIN:
 			/* The owner was exiting, try again. */
 			double_unlock_hb(hb1, hb2);
+			hb_waiters_dec(hb2);
 			put_futex_key(&key2);
 			put_futex_key(&key1);
 			cond_resched();
@@ -1573,6 +1612,7 @@ retry_private:
 
 out_unlock:
 	double_unlock_hb(hb1, hb2);
+	hb_waiters_dec(hb2);
 
 	/*
 	 * drop_futex_key_refs() must be called outside the spinlocks. During
@@ -1600,6 +1640,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 	struct futex_hash_bucket *hb;
 
 	hb = hash_futex(&q->key);
+
+	/*
+	 * Increment the counter before taking the lock so that
+	 * a potential waker won't miss a to-be-slept task that is
+	 * waiting for the spinlock. This is safe as all queue_lock()
+	 * users end up calling queue_me(). Similarly, for housekeeping,
+	 * decrement the counter at queue_unlock() when some error has
+	 * occurred and we don't end up adding the task to the list.
+	 */
+	hb_waiters_inc(hb);
+
 	q->lock_ptr = &hb->lock;
 
 	spin_lock(&hb->lock); /* implies MB (A) */
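
The comment's "all queue_lock() users end up calling queue_me()" is the balance invariant: on success the waiter stays counted until __unqueue_futex() drops it; on failure queue_unlock() drops it immediately. A condensed sketch of that shape, modeled on futex_wait_setup() in this file (fault-retry handling omitted):

static int wait_setup_sketch(u32 __user *uaddr, u32 val, struct futex_q *q,
                             struct futex_hash_bucket **hb)
{
        u32 uval;
        int ret;

        *hb = queue_lock(q);            /* hb_waiters_inc() + spin_lock() */

        ret = get_futex_value_locked(&uval, uaddr);
        if (ret || uval != val) {
                /* Error path: spin_unlock() + hb_waiters_dec(). */
                queue_unlock(*hb);
                return ret ? ret : -EWOULDBLOCK;
        }

        /*
         * Success path: stay counted. queue_me() adds us to hb->chain,
         * and the matching hb_waiters_dec() happens in __unqueue_futex().
         */
        return 0;
}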
@@ -1611,6 +1662,7 @@ queue_unlock(struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
 	spin_unlock(&hb->lock);
+	hb_waiters_dec(hb);
 }
 
 /**
@@ -2342,6 +2394,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 	 * Unqueue the futex_q and determine which it was.
 	 */
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 
 	/* Handle spurious wakeups gracefully */
 	ret = -EWOULDBLOCK;
@@ -2843,9 +2896,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
 }
 
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
 {
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
 	u32 curval;
+
+	/*
+	 * This will fail and we want it. Some arch implementations do
+	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
+	 * functionality. We want to know that before we call in any
+	 * of the complex code paths. Also we want to prevent
+	 * registration of robust lists in that case. NULL is
+	 * guaranteed to fault and we get -EFAULT on functional
+	 * implementation, the non-functional ones will return
+	 * -ENOSYS.
+	 */
+	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
+		futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
 	unsigned int futex_shift;
 	unsigned long i;
 
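
Factoring the probe out of futex_init() lets architectures that select CONFIG_HAVE_FUTEX_CMPXCHG compile the runtime test away entirely; the matching header change (not part of this diff) presumably turns futex_cmpxchg_enabled into a constant 1 in that configuration. Elsewhere in this file the flag gates the operations that need a working atomic cmpxchg; a sketch of that gating, with the case list condensed from do_futex():

        switch (cmd) {
        case FUTEX_LOCK_PI:
        case FUTEX_UNLOCK_PI:
        case FUTEX_TRYLOCK_PI:
        case FUTEX_WAIT_REQUEUE_PI:
        case FUTEX_CMP_REQUEUE_PI:
                if (!futex_cmpxchg_enabled)
                        return -ENOSYS; /* arch lacks futex_atomic_cmpxchg_inatomic() */
        }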
@@ -2861,20 +2933,11 @@ static int __init futex_init(void)
 					       &futex_shift, NULL,
 					       futex_hashsize, futex_hashsize);
 	futex_hashsize = 1UL << futex_shift;
-	/*
-	 * This will fail and we want it. Some arch implementations do
-	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
-	 * functionality. We want to know that before we call in any
-	 * of the complex code paths. Also we want to prevent
-	 * registration of robust lists in that case. NULL is
-	 * guaranteed to fault and we get -EFAULT on functional
-	 * implementation, the non-functional ones will return
-	 * -ENOSYS.
-	 */
-	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
-		futex_cmpxchg_enabled = 1;
+
+	futex_detect_cmpxchg();
 
 	for (i = 0; i < futex_hashsize; i++) {
+		atomic_set(&futex_queues[i].waiters, 0);
 		plist_head_init(&futex_queues[i].chain);
 		spin_lock_init(&futex_queues[i].lock);
 	}
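
For reference, the buckets initialized here are selected by hash_futex(), which this diff leaves unchanged; a sketch of its shape in this era of the file, shown so the atomic_set() above can be read against the lookup it serves (the mask uses the boot-time futex_hashsize rather than a compile-time constant):

static struct futex_hash_bucket *hash_futex_sketch(union futex_key *key)
{
        u32 hash = jhash2((u32 *)&key->both.word,
                          (sizeof(key->both.word) + sizeof(key->both.ptr)) / 4,
                          key->both.offset);
        return &futex_queues[hash & (futex_hashsize - 1)];
}

Each bucket's waiters counter starts at zero, matching the invariant that the counter equals the number of queued waiters plus any transient pre-bias from queue_lock() or futex_requeue().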
