Diffstat (limited to 'kernel/futex.c')
-rw-r--r--	kernel/futex.c	236
1 file changed, 197 insertions(+), 39 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index f6ff0191ecf7..08ec814ad9d2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,14 +63,101 @@
 #include <linux/sched/rt.h>
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
+#include <linux/bootmem.h>
 
 #include <asm/futex.h>
 
 #include "locking/rtmutex_common.h"
 
-int __read_mostly futex_cmpxchg_enabled;
+/*
+ * Basic futex operation and ordering guarantees:
+ *
+ * The waiter reads the futex value in user space and calls
+ * futex_wait(). This function computes the hash bucket and acquires
+ * the hash bucket lock. After that it reads the futex user space value
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
+ *
+ * The waker side modifies the user space value of the futex and calls
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
+ *
+ * In futex wake-up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and the wake-up path can return early. In
+ * order for this optimization to work, ordering guarantees must exist so
+ * that a waiter being added to the list is visible when the list is
+ * concurrently checked by the waker, avoiding scenarios like the following:
+ *
+ * CPU 0                               CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *   uval = *futex;
+ *                                     *futex = newval;
+ *                                     sys_futex(WAKE, futex);
+ *                                       futex_wake(futex);
+ *                                       if (queue_empty())
+ *                                         return;
+ *   if (uval == val)
+ *     lock(hash_bucket(futex));
+ *     queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();
+ *
+ * This would cause the waiter on CPU 0 to wait forever because it
+ * missed the transition of the user space value from val to newval
+ * and the waker did not find the waiter in the hash bucket queue.
+ *
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0                               CPU 1
+ * val = *futex;
+ * sys_futex(WAIT, futex, val);
+ *   futex_wait(futex, val);
+ *
+ *   waiters++;
+ *   mb(); (A) <-- paired with -.
+ *                              |
+ *   lock(hash_bucket(futex));  |
+ *                              |
+ *   uval = *futex;             |
+ *                              |        *futex = newval;
+ *                              |        sys_futex(WAKE, futex);
+ *                              |          futex_wake(futex);
+ *                              |
+ *                              `------->  mb(); (B)
+ *   if (uval == val)
+ *     queue();
+ *     unlock(hash_bucket(futex));
+ *     schedule();                         if (waiters)
+ *                                           lock(hash_bucket(futex));
+ *                                           wake_waiters(futex);
+ *                                           unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read -- this
+ * is guaranteed by the head counter in the hb spinlock; and where (B)
+ * orders the write to futex and the waiters read -- this is done by the
+ * barriers in get_futex_key_refs(), through either ihold() or atomic_inc(),
+ * depending on the futex type.
+ *
+ * This yields the following case (where X := waiters, Y := futex):
+ *
+ *	X = Y = 0
+ *
+ *	w[X]=1		w[Y]=1
+ *	MB		MB
+ *	r[Y]=y		r[X]=x
+ *
+ * This guarantees that the outcome x == 0 && y == 0 is impossible, which
+ * translates back into the guarantee that we cannot both miss the futex
+ * variable change and the enqueue.
+ */
 
-#define FUTEX_HASHBITS	(CONFIG_BASE_SMALL ? 4 : 8)
+int __read_mostly futex_cmpxchg_enabled;
 
 /*
  * Futex flags used to encode options to functions and preserve them across
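The (A)/(B) pairing above is the classic store-buffering litmus test. As an aside, the same pattern can be reproduced in user space with C11 atomics; the sketch below is illustrative only (it is not kernel code, and all names are made up), with the two seq_cst fences standing in for MB (A) and MB (B):

	/* sb_litmus.c - build with: cc -pthread sb_litmus.c */
	#include <assert.h>
	#include <pthread.h>
	#include <stdatomic.h>

	static atomic_int X;	/* stands in for hb->waiters */
	static atomic_int Y;	/* stands in for the futex word */
	static int x, y;

	static void *waiter_side(void *arg)
	{
		atomic_store_explicit(&X, 1, memory_order_relaxed);	/* waiters++ */
		atomic_thread_fence(memory_order_seq_cst);		/* MB (A) */
		y = atomic_load_explicit(&Y, memory_order_relaxed);	/* uval = *futex */
		return NULL;
	}

	static void *waker_side(void *arg)
	{
		atomic_store_explicit(&Y, 1, memory_order_relaxed);	/* *futex = newval */
		atomic_thread_fence(memory_order_seq_cst);		/* MB (B) */
		x = atomic_load_explicit(&X, memory_order_relaxed);	/* if (waiters) */
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, waiter_side, NULL);
		pthread_create(&b, NULL, waker_side, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);

		/* With both fences present, at least one thread must
		 * observe the other's store. */
		assert(!(x == 0 && y == 0));
		return 0;
	}

Without the fences (or with weaker ones), real hardware can produce x == 0 && y == 0, which is exactly the lost-wakeup window the comment describes.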
@@ -147,11 +234,59 @@ static const struct futex_q futex_q_init = {
  * waiting on a futex.
  */
 struct futex_hash_bucket {
+	atomic_t waiters;
 	spinlock_t lock;
 	struct plist_head chain;
-};
+} ____cacheline_aligned_in_smp;
+
+static unsigned long __read_mostly futex_hashsize;
+
+static struct futex_hash_bucket *futex_queues;
+
+static inline void futex_get_mm(union futex_key *key)
+{
+	atomic_inc(&key->private.mm->mm_count);
+	/*
+	 * Ensure futex_get_mm() implies a full barrier such that
+	 * get_futex_key() implies a full barrier. This is relied upon
+	 * as full barrier (B), see the ordering comment above.
+	 */
+	smp_mb__after_atomic_inc();
+}
+
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	atomic_inc(&hb->waiters);
+	/*
+	 * Full barrier (A), see the ordering comment above.
+	 */
+	smp_mb__after_atomic_inc();
+#endif
+}
 
-static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	atomic_dec(&hb->waiters);
+#endif
+}
+
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+	return atomic_read(&hb->waiters);
+#else
+	return 1;
+#endif
+}
 
 /*
  * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +296,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
 	u32 hash = jhash2((u32*)&key->both.word,
 			  (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
 			  key->both.offset);
-	return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
+	return &futex_queues[hash & (futex_hashsize - 1)];
 }
 
 /*
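Note the !CONFIG_SMP branch of hb_waiters_pending(): on a uniprocessor kernel there is no cross-CPU ordering to exploit, so the helper simply reports a waiter as pending and the wake path keeps its old behavior of always taking the bucket lock.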
@@ -187,10 +322,10 @@ static void get_futex_key_refs(union futex_key *key)
 
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
-		ihold(key->shared.inode);
+		ihold(key->shared.inode); /* implies MB (B) */
 		break;
 	case FUT_OFF_MMSHARED:
-		atomic_inc(&key->private.mm->mm_count);
+		futex_get_mm(key); /* implies MB (B) */
 		break;
 	}
 }
@@ -264,7 +399,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 	if (!fshared) {
 		key->private.mm = mm;
 		key->private.address = address;
-		get_futex_key_refs(key);
+		get_futex_key_refs(key); /* implies MB (B) */
 		return 0;
 	}
 
@@ -371,7 +506,7 @@ again:
 		key->shared.pgoff = basepage_index(page);
 	}
 
-	get_futex_key_refs(key);
+	get_futex_key_refs(key); /* implies MB (B) */
 
 out:
 	unlock_page(page_head);
@@ -598,13 +733,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	struct task_struct *p;
 	pid_t pid = uval & FUTEX_TID_MASK;
 
-	head = &hb->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex(&this->key, key)) {
 			/*
 			 * Another waiter already exists - bump up
@@ -838,6 +970,7 @@ static void __unqueue_futex(struct futex_q *q)
 
 	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 }
 
 /*
@@ -986,7 +1119,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
 	int ret;
 
@@ -998,10 +1130,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 		goto out;
 
 	hb = hash_futex(&key);
+
+	/* Make sure we really have tasks to wakeup */
+	if (!hb_waiters_pending(hb))
+		goto out_put_key;
+
 	spin_lock(&hb->lock);
-	head = &hb->chain;
 
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (match_futex (&this->key, &key)) {
 			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
@@ -1019,6 +1155,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 	}
 
 	spin_unlock(&hb->lock);
+out_put_key:
 	put_futex_key(&key);
 out:
 	return ret;
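For context, the user-space half of the protocol that futex_wait()/futex_wake() implement looks roughly as follows. This is a hedged sketch, not part of the patch: glibc provides no futex(2) wrapper, so the raw syscall is used, and the helper names are made up:

	#include <linux/futex.h>
	#include <stdatomic.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static atomic_int futex_word;

	static long futex(atomic_int *uaddr, int op, int val)
	{
		/* timeout, uaddr2 and val3 are unused for WAIT/WAKE here */
		return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
	}

	/* Waiter: sleep only while the word still holds the expected
	 * value; the kernel re-checks the value under the bucket lock,
	 * so a concurrent change cannot be missed. */
	static void wait_for_change(int expected)
	{
		while (atomic_load(&futex_word) == expected)
			futex(&futex_word, FUTEX_WAIT, expected);
	}

	/* Waker: publish the new value first, then wake one waiter. */
	static void change_and_wake(int newval)
	{
		atomic_store(&futex_word, newval);
		futex(&futex_word, FUTEX_WAKE, 1);
	}

The hb_waiters_pending() check added above makes the FUTEX_WAKE call nearly free when no task is sleeping in wait_for_change().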
@@ -1034,7 +1171,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct plist_head *head;
 	struct futex_q *this, *next;
 	int ret, op_ret;
 
@@ -1082,9 +1218,7 @@ retry_private:
 		goto retry;
 	}
 
-	head = &hb1->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
 		if (match_futex (&this->key, &key1)) {
 			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
@@ -1097,10 +1231,8 @@ retry_private:
 	}
 
 	if (op_ret > 0) {
-		head = &hb2->chain;
-
 		op_ret = 0;
-		plist_for_each_entry_safe(this, next, head, list) {
+		plist_for_each_entry_safe(this, next, &hb2->chain, list) {
 			if (match_futex (&this->key, &key2)) {
 				if (this->pi_state || this->rt_waiter) {
 					ret = -EINVAL;
@@ -1142,7 +1274,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	 */
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
+		hb_waiters_dec(hb1);
 		plist_add(&q->list, &hb2->chain);
+		hb_waiters_inc(hb2);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
@@ -1270,7 +1404,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	int drop_count = 0, task_count = 0, ret;
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
-	struct plist_head *head1;
 	struct futex_q *this, *next;
 	u32 curval2;
 
@@ -1393,8 +1526,7 @@ retry_private:
 		}
 	}
 
-	head1 = &hb1->chain;
-	plist_for_each_entry_safe(this, next, head1, list) {
+	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
 		if (task_count - nr_wake >= nr_requeue)
 			break;
 
@@ -1487,17 +1619,29 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 	struct futex_hash_bucket *hb;
 
 	hb = hash_futex(&q->key);
+
+	/*
+	 * Increment the counter before taking the lock so that
+	 * a potential waker won't miss a to-be-slept task that is
+	 * waiting for the spinlock. This is safe as all queue_lock()
+	 * users end up calling queue_me(). Similarly, for housekeeping,
+	 * decrement the counter at queue_unlock() when some error has
+	 * occurred and we don't end up adding the task to the list.
+	 */
+	hb_waiters_inc(hb);
+
 	q->lock_ptr = &hb->lock;
 
-	spin_lock(&hb->lock);
+	spin_lock(&hb->lock); /* implies MB (A) */
 	return hb;
 }
 
 static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+queue_unlock(struct futex_hash_bucket *hb)
 	__releases(&hb->lock)
 {
 	spin_unlock(&hb->lock);
+	hb_waiters_dec(hb);
 }
 
 /**
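To see why hb_waiters_inc() must come before spin_lock(), consider the interleaving that would be possible if the counter were only bumped after the lock was acquired; a sketch, not kernel code:

	waiter					waker
	spin_lock(&hb->lock);
	uval = *futex;	/* == val */
						*futex = newval;
						if (!hb_waiters_pending(hb))
							return;	/* counter still 0 */
	hb_waiters_inc(hb);	/* too late */
	queue();
	spin_unlock(&hb->lock);
	schedule();	/* never woken */

Incrementing before the lock closes this window: from the waker's point of view, a task committed to sleeping is accounted for as soon as it starts contending for the bucket lock.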
@@ -1867,7 +2011,7 @@ retry_private:
 	ret = get_futex_value_locked(&uval, uaddr);
 
 	if (ret) {
-		queue_unlock(q, *hb);
+		queue_unlock(*hb);
 
 		ret = get_user(uval, uaddr);
 		if (ret)
@@ -1881,7 +2025,7 @@ retry_private:
 	}
 
 	if (uval != val) {
-		queue_unlock(q, *hb);
+		queue_unlock(*hb);
 		ret = -EWOULDBLOCK;
 	}
 
@@ -2029,7 +2173,7 @@ retry_private:
 			 * Task is exiting and we just wait for the
 			 * exit to complete.
 			 */
-			queue_unlock(&q, hb);
+			queue_unlock(hb);
 			put_futex_key(&q.key);
 			cond_resched();
 			goto retry;
@@ -2081,7 +2225,7 @@ retry_private:
 		goto out_put_key;
 
 out_unlock_put_key:
-	queue_unlock(&q, hb);
+	queue_unlock(hb);
 
 out_put_key:
 	put_futex_key(&q.key);
@@ -2091,7 +2235,7 @@ out:
 	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
-	queue_unlock(&q, hb);
+	queue_unlock(hb);
 
 	ret = fault_in_user_writeable(uaddr);
 	if (ret)
@@ -2113,7 +2257,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
-	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
 	u32 uval, vpid = task_pid_vnr(current);
 	int ret;
@@ -2153,9 +2296,7 @@ retry:
 	 * Ok, other tasks may need to be woken up - check waiters
 	 * and do the wakeup if necessary:
 	 */
-	head = &hb->chain;
-
-	plist_for_each_entry_safe(this, next, head, list) {
+	plist_for_each_entry_safe(this, next, &hb->chain, list) {
 		if (!match_futex (&this->key, &key))
 			continue;
 		ret = wake_futex_pi(uaddr, uval, this);
@@ -2232,6 +2373,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 	 * Unqueue the futex_q and determine which it was.
 	 */
 	plist_del(&q->list, &hb->chain);
+	hb_waiters_dec(hb);
 
 	/* Handle spurious wakeups gracefully */
 	ret = -EWOULDBLOCK;
@@ -2316,6 +2458,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 * code while we sleep on uaddr.
 	 */
 	debug_rt_mutex_init_waiter(&rt_waiter);
+	RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+	RB_CLEAR_NODE(&rt_waiter.tree_entry);
 	rt_waiter.task = NULL;
 
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2734,8 +2878,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 static int __init futex_init(void)
 {
 	u32 curval;
-	int i;
+	unsigned int futex_shift;
+	unsigned long i;
+
+#if CONFIG_BASE_SMALL
+	futex_hashsize = 16;
+#else
+	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+#endif
 
+	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
+					       futex_hashsize, 0,
+					       futex_hashsize < 256 ? HASH_SMALL : 0,
+					       &futex_shift, NULL,
+					       futex_hashsize, futex_hashsize);
+	futex_hashsize = 1UL << futex_shift;
 	/*
 	 * This will fail and we want it. Some arch implementations do
 	 * runtime detection of the futex_atomic_cmpxchg_inatomic()
@@ -2749,7 +2906,8 @@ static int __init futex_init(void)
 	if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
 		futex_cmpxchg_enabled = 1;
 
-	for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
+	for (i = 0; i < futex_hashsize; i++) {
+		atomic_set(&futex_queues[i].waiters, 0);
 		plist_head_init(&futex_queues[i].chain);
 		spin_lock_init(&futex_queues[i].lock);
 	}
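For a sense of the new sizing: on a machine with, say, 16 possible CPUs, futex_hashsize becomes roundup_pow_of_two(256 * 16) = 4096 buckets, against the previous compile-time 1 << FUTEX_HASHBITS = 256. alloc_large_system_hash() may round the request again and reports the actual order through futex_shift, which is why futex_hashsize is recomputed as 1UL << futex_shift before the buckets are initialized. Together with the ____cacheline_aligned_in_smp annotation on struct futex_hash_bucket, this trades memory for fewer hash collisions and less false sharing between adjacent buckets.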