about summary refs log tree commit diff stats
path: root/kernel/futex.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/futex.c')
-rw-r--r--  kernel/futex.c  203
1 file changed, 164 insertions(+), 39 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index f6ff0191ecf7..44a1261cb9ff 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -63,14 +63,101 @@
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/hugetlb.h> 64#include <linux/hugetlb.h>
65#include <linux/freezer.h> 65#include <linux/freezer.h>
66#include <linux/bootmem.h>
66 67
67#include <asm/futex.h> 68#include <asm/futex.h>
68 69
69#include "locking/rtmutex_common.h" 70#include "locking/rtmutex_common.h"
70 71
71int __read_mostly futex_cmpxchg_enabled; 72/*
73 * Basic futex operation and ordering guarantees:
74 *
75 * The waiter reads the futex value in user space and calls
76 * futex_wait(). This function computes the hash bucket and acquires
77 * the hash bucket lock. After that it reads the futex user space value
78 * again and verifies that the data has not changed. If it has not changed
79 * it enqueues itself into the hash bucket, releases the hash bucket lock
80 * and schedules.
81 *
82 * The waker side modifies the user space value of the futex and calls
83 * futex_wake(). This function computes the hash bucket and acquires the
84 * hash bucket lock. Then it looks for waiters on that futex in the hash
85 * bucket and wakes them.
86 *
87 * In futex wake up scenarios where no tasks are blocked on a futex, taking
88 * the hb spinlock can be avoided and simply return. In order for this
89 * optimization to work, ordering guarantees must exist so that the waiter
90 * being added to the list is acknowledged when the list is concurrently being
91 * checked by the waker, avoiding scenarios like the following:
92 *
93 * CPU 0 CPU 1
94 * val = *futex;
95 * sys_futex(WAIT, futex, val);
96 * futex_wait(futex, val);
97 * uval = *futex;
98 * *futex = newval;
99 * sys_futex(WAKE, futex);
100 * futex_wake(futex);
101 * if (queue_empty())
102 * return;
103 * if (uval == val)
104 * lock(hash_bucket(futex));
105 * queue();
106 * unlock(hash_bucket(futex));
107 * schedule();
108 *
109 * This would cause the waiter on CPU 0 to wait forever because it
110 * missed the transition of the user space value from val to newval
111 * and the waker did not find the waiter in the hash bucket queue.
112 *
113 * The correct serialization ensures that a waiter either observes
114 * the changed user space value before blocking or is woken by a
115 * concurrent waker:
116 *
117 * CPU 0 CPU 1
118 * val = *futex;
119 * sys_futex(WAIT, futex, val);
120 * futex_wait(futex, val);
121 *
122 * waiters++;
123 * mb(); (A) <-- paired with -.
124 * |
125 * lock(hash_bucket(futex)); |
126 * |
127 * uval = *futex; |
128 * | *futex = newval;
129 * | sys_futex(WAKE, futex);
130 * | futex_wake(futex);
131 * |
132 * `-------> mb(); (B)
133 * if (uval == val)
134 * queue();
135 * unlock(hash_bucket(futex));
136 * schedule(); if (waiters)
137 * lock(hash_bucket(futex));
138 * wake_waiters(futex);
139 * unlock(hash_bucket(futex));
140 *
141 * Where (A) orders the waiters increment and the futex value read -- this
142 * is guaranteed by the head counter in the hb spinlock; and where (B)
143 * orders the write to futex and the waiters read -- this is done by the
144 * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
145 * depending on the futex type.
146 *
147 * This yields the following case (where X:=waiters, Y:=futex):
148 *
149 * X = Y = 0
150 *
151 * w[X]=1 w[Y]=1
152 * MB MB
153 * r[Y]=y r[X]=x
154 *
155 * Which guarantees that x==0 && y==0 is impossible; which translates back into
156 * the guarantee that we cannot both miss the futex variable change and the
157 * enqueue.
158 */
72 159
73#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 160int __read_mostly futex_cmpxchg_enabled;
74 161
75/* 162/*
76 * Futex flags used to encode options to functions and preserve them across 163 * Futex flags used to encode options to functions and preserve them across
@@ -149,9 +236,41 @@ static const struct futex_q futex_q_init = {
149struct futex_hash_bucket { 236struct futex_hash_bucket {
150 spinlock_t lock; 237 spinlock_t lock;
151 struct plist_head chain; 238 struct plist_head chain;
152}; 239} ____cacheline_aligned_in_smp;
153 240
154static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 241static unsigned long __read_mostly futex_hashsize;
242
243static struct futex_hash_bucket *futex_queues;
244
245static inline void futex_get_mm(union futex_key *key)
246{
247 atomic_inc(&key->private.mm->mm_count);
248 /*
249 * Ensure futex_get_mm() implies a full barrier such that
250 * get_futex_key() implies a full barrier. This is relied upon
251 * as full barrier (B), see the ordering comment above.
252 */
253 smp_mb__after_atomic_inc();
254}
255
256static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
257{
258#ifdef CONFIG_SMP
259 /*
260 * Tasks trying to enter the critical region are most likely
261 * potential waiters that will be added to the plist. Ensure
262 * that wakers won't miss to-be-slept tasks in the window between
263 * the wait call and the actual plist_add.
264 */
265 if (spin_is_locked(&hb->lock))
266 return true;
267 smp_rmb(); /* Make sure we check the lock state first */
268
269 return !plist_head_empty(&hb->chain);
270#else
271 return true;
272#endif
273}
155 274
156/* 275/*
157 * We hash on the keys returned from get_futex_key (see below). 276 * We hash on the keys returned from get_futex_key (see below).
@@ -161,7 +280,7 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
161 u32 hash = jhash2((u32*)&key->both.word, 280 u32 hash = jhash2((u32*)&key->both.word,
162 (sizeof(key->both.word)+sizeof(key->both.ptr))/4, 281 (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
163 key->both.offset); 282 key->both.offset);
164 return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; 283 return &futex_queues[hash & (futex_hashsize - 1)];
165} 284}
166 285
167/* 286/*
@@ -187,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key)
187 306
188 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 307 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
189 case FUT_OFF_INODE: 308 case FUT_OFF_INODE:
190 ihold(key->shared.inode); 309 ihold(key->shared.inode); /* implies MB (B) */
191 break; 310 break;
192 case FUT_OFF_MMSHARED: 311 case FUT_OFF_MMSHARED:
193 atomic_inc(&key->private.mm->mm_count); 312 futex_get_mm(key); /* implies MB (B) */
194 break; 313 break;
195 } 314 }
196} 315}
@@ -264,7 +383,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
264 if (!fshared) { 383 if (!fshared) {
265 key->private.mm = mm; 384 key->private.mm = mm;
266 key->private.address = address; 385 key->private.address = address;
267 get_futex_key_refs(key); 386 get_futex_key_refs(key); /* implies MB (B) */
268 return 0; 387 return 0;
269 } 388 }
270 389
@@ -371,7 +490,7 @@ again:
371 key->shared.pgoff = basepage_index(page); 490 key->shared.pgoff = basepage_index(page);
372 } 491 }
373 492
374 get_futex_key_refs(key); 493 get_futex_key_refs(key); /* implies MB (B) */
375 494
376out: 495out:
377 unlock_page(page_head); 496 unlock_page(page_head);
@@ -598,13 +717,10 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
598{ 717{
599 struct futex_pi_state *pi_state = NULL; 718 struct futex_pi_state *pi_state = NULL;
600 struct futex_q *this, *next; 719 struct futex_q *this, *next;
601 struct plist_head *head;
602 struct task_struct *p; 720 struct task_struct *p;
603 pid_t pid = uval & FUTEX_TID_MASK; 721 pid_t pid = uval & FUTEX_TID_MASK;
604 722
605 head = &hb->chain; 723 plist_for_each_entry_safe(this, next, &hb->chain, list) {
606
607 plist_for_each_entry_safe(this, next, head, list) {
608 if (match_futex(&this->key, key)) { 724 if (match_futex(&this->key, key)) {
609 /* 725 /*
610 * Another waiter already exists - bump up 726 * Another waiter already exists - bump up
@@ -986,7 +1102,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
986{ 1102{
987 struct futex_hash_bucket *hb; 1103 struct futex_hash_bucket *hb;
988 struct futex_q *this, *next; 1104 struct futex_q *this, *next;
989 struct plist_head *head;
990 union futex_key key = FUTEX_KEY_INIT; 1105 union futex_key key = FUTEX_KEY_INIT;
991 int ret; 1106 int ret;
992 1107
@@ -998,10 +1113,14 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
998 goto out; 1113 goto out;
999 1114
1000 hb = hash_futex(&key); 1115 hb = hash_futex(&key);
1116
1117 /* Make sure we really have tasks to wakeup */
1118 if (!hb_waiters_pending(hb))
1119 goto out_put_key;
1120
1001 spin_lock(&hb->lock); 1121 spin_lock(&hb->lock);
1002 head = &hb->chain;
1003 1122
1004 plist_for_each_entry_safe(this, next, head, list) { 1123 plist_for_each_entry_safe(this, next, &hb->chain, list) {
1005 if (match_futex (&this->key, &key)) { 1124 if (match_futex (&this->key, &key)) {
1006 if (this->pi_state || this->rt_waiter) { 1125 if (this->pi_state || this->rt_waiter) {
1007 ret = -EINVAL; 1126 ret = -EINVAL;
@@ -1019,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1019 } 1138 }
1020 1139
1021 spin_unlock(&hb->lock); 1140 spin_unlock(&hb->lock);
1141out_put_key:
1022 put_futex_key(&key); 1142 put_futex_key(&key);
1023out: 1143out:
1024 return ret; 1144 return ret;
@@ -1034,7 +1154,6 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1034{ 1154{
1035 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; 1155 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1036 struct futex_hash_bucket *hb1, *hb2; 1156 struct futex_hash_bucket *hb1, *hb2;
1037 struct plist_head *head;
1038 struct futex_q *this, *next; 1157 struct futex_q *this, *next;
1039 int ret, op_ret; 1158 int ret, op_ret;
1040 1159
@@ -1082,9 +1201,7 @@ retry_private:
1082 goto retry; 1201 goto retry;
1083 } 1202 }
1084 1203
1085 head = &hb1->chain; 1204 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1086
1087 plist_for_each_entry_safe(this, next, head, list) {
1088 if (match_futex (&this->key, &key1)) { 1205 if (match_futex (&this->key, &key1)) {
1089 if (this->pi_state || this->rt_waiter) { 1206 if (this->pi_state || this->rt_waiter) {
1090 ret = -EINVAL; 1207 ret = -EINVAL;
@@ -1097,10 +1214,8 @@ retry_private:
1097 } 1214 }
1098 1215
1099 if (op_ret > 0) { 1216 if (op_ret > 0) {
1100 head = &hb2->chain;
1101
1102 op_ret = 0; 1217 op_ret = 0;
1103 plist_for_each_entry_safe(this, next, head, list) { 1218 plist_for_each_entry_safe(this, next, &hb2->chain, list) {
1104 if (match_futex (&this->key, &key2)) { 1219 if (match_futex (&this->key, &key2)) {
1105 if (this->pi_state || this->rt_waiter) { 1220 if (this->pi_state || this->rt_waiter) {
1106 ret = -EINVAL; 1221 ret = -EINVAL;
@@ -1270,7 +1385,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1270 int drop_count = 0, task_count = 0, ret; 1385 int drop_count = 0, task_count = 0, ret;
1271 struct futex_pi_state *pi_state = NULL; 1386 struct futex_pi_state *pi_state = NULL;
1272 struct futex_hash_bucket *hb1, *hb2; 1387 struct futex_hash_bucket *hb1, *hb2;
1273 struct plist_head *head1;
1274 struct futex_q *this, *next; 1388 struct futex_q *this, *next;
1275 u32 curval2; 1389 u32 curval2;
1276 1390
@@ -1393,8 +1507,7 @@ retry_private:
1393 } 1507 }
1394 } 1508 }
1395 1509
1396 head1 = &hb1->chain; 1510 plist_for_each_entry_safe(this, next, &hb1->chain, list) {
1397 plist_for_each_entry_safe(this, next, head1, list) {
1398 if (task_count - nr_wake >= nr_requeue) 1511 if (task_count - nr_wake >= nr_requeue)
1399 break; 1512 break;
1400 1513
@@ -1489,12 +1602,12 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1489 hb = hash_futex(&q->key); 1602 hb = hash_futex(&q->key);
1490 q->lock_ptr = &hb->lock; 1603 q->lock_ptr = &hb->lock;
1491 1604
1492 spin_lock(&hb->lock); 1605 spin_lock(&hb->lock); /* implies MB (A) */
1493 return hb; 1606 return hb;
1494} 1607}
1495 1608
1496static inline void 1609static inline void
1497queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1610queue_unlock(struct futex_hash_bucket *hb)
1498 __releases(&hb->lock) 1611 __releases(&hb->lock)
1499{ 1612{
1500 spin_unlock(&hb->lock); 1613 spin_unlock(&hb->lock);
@@ -1867,7 +1980,7 @@ retry_private:
1867 ret = get_futex_value_locked(&uval, uaddr); 1980 ret = get_futex_value_locked(&uval, uaddr);
1868 1981
1869 if (ret) { 1982 if (ret) {
1870 queue_unlock(q, *hb); 1983 queue_unlock(*hb);
1871 1984
1872 ret = get_user(uval, uaddr); 1985 ret = get_user(uval, uaddr);
1873 if (ret) 1986 if (ret)
@@ -1881,7 +1994,7 @@ retry_private:
1881 } 1994 }
1882 1995
1883 if (uval != val) { 1996 if (uval != val) {
1884 queue_unlock(q, *hb); 1997 queue_unlock(*hb);
1885 ret = -EWOULDBLOCK; 1998 ret = -EWOULDBLOCK;
1886 } 1999 }
1887 2000
@@ -2029,7 +2142,7 @@ retry_private:
2029 * Task is exiting and we just wait for the 2142 * Task is exiting and we just wait for the
2030 * exit to complete. 2143 * exit to complete.
2031 */ 2144 */
2032 queue_unlock(&q, hb); 2145 queue_unlock(hb);
2033 put_futex_key(&q.key); 2146 put_futex_key(&q.key);
2034 cond_resched(); 2147 cond_resched();
2035 goto retry; 2148 goto retry;
@@ -2081,7 +2194,7 @@ retry_private:
2081 goto out_put_key; 2194 goto out_put_key;
2082 2195
2083out_unlock_put_key: 2196out_unlock_put_key:
2084 queue_unlock(&q, hb); 2197 queue_unlock(hb);
2085 2198
2086out_put_key: 2199out_put_key:
2087 put_futex_key(&q.key); 2200 put_futex_key(&q.key);
@@ -2091,7 +2204,7 @@ out:
2091 return ret != -EINTR ? ret : -ERESTARTNOINTR; 2204 return ret != -EINTR ? ret : -ERESTARTNOINTR;
2092 2205
2093uaddr_faulted: 2206uaddr_faulted:
2094 queue_unlock(&q, hb); 2207 queue_unlock(hb);
2095 2208
2096 ret = fault_in_user_writeable(uaddr); 2209 ret = fault_in_user_writeable(uaddr);
2097 if (ret) 2210 if (ret)
@@ -2113,7 +2226,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2113{ 2226{
2114 struct futex_hash_bucket *hb; 2227 struct futex_hash_bucket *hb;
2115 struct futex_q *this, *next; 2228 struct futex_q *this, *next;
2116 struct plist_head *head;
2117 union futex_key key = FUTEX_KEY_INIT; 2229 union futex_key key = FUTEX_KEY_INIT;
2118 u32 uval, vpid = task_pid_vnr(current); 2230 u32 uval, vpid = task_pid_vnr(current);
2119 int ret; 2231 int ret;
@@ -2153,9 +2265,7 @@ retry:
2153 * Ok, other tasks may need to be woken up - check waiters 2265 * Ok, other tasks may need to be woken up - check waiters
2154 * and do the wakeup if necessary: 2266 * and do the wakeup if necessary:
2155 */ 2267 */
2156 head = &hb->chain; 2268 plist_for_each_entry_safe(this, next, &hb->chain, list) {
2157
2158 plist_for_each_entry_safe(this, next, head, list) {
2159 if (!match_futex (&this->key, &key)) 2269 if (!match_futex (&this->key, &key))
2160 continue; 2270 continue;
2161 ret = wake_futex_pi(uaddr, uval, this); 2271 ret = wake_futex_pi(uaddr, uval, this);
@@ -2316,6 +2426,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2316 * code while we sleep on uaddr. 2426 * code while we sleep on uaddr.
2317 */ 2427 */
2318 debug_rt_mutex_init_waiter(&rt_waiter); 2428 debug_rt_mutex_init_waiter(&rt_waiter);
2429 RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
2430 RB_CLEAR_NODE(&rt_waiter.tree_entry);
2319 rt_waiter.task = NULL; 2431 rt_waiter.task = NULL;
2320 2432
2321 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); 2433 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -2734,8 +2846,21 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2734static int __init futex_init(void) 2846static int __init futex_init(void)
2735{ 2847{
2736 u32 curval; 2848 u32 curval;
2737 int i; 2849 unsigned int futex_shift;
2850 unsigned long i;
2851
2852#if CONFIG_BASE_SMALL
2853 futex_hashsize = 16;
2854#else
2855 futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
2856#endif
2738 2857
2858 futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
2859 futex_hashsize, 0,
2860 futex_hashsize < 256 ? HASH_SMALL : 0,
2861 &futex_shift, NULL,
2862 futex_hashsize, futex_hashsize);
2863 futex_hashsize = 1UL << futex_shift;
2739 /* 2864 /*
2740 * This will fail and we want it. Some arch implementations do 2865 * This will fail and we want it. Some arch implementations do
2741 * runtime detection of the futex_atomic_cmpxchg_inatomic() 2866 * runtime detection of the futex_atomic_cmpxchg_inatomic()
@@ -2749,7 +2874,7 @@ static int __init futex_init(void)
2749 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) 2874 if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2750 futex_cmpxchg_enabled = 1; 2875 futex_cmpxchg_enabled = 1;
2751 2876
2752 for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { 2877 for (i = 0; i < futex_hashsize; i++) {
2753 plist_head_init(&futex_queues[i].chain); 2878 plist_head_init(&futex_queues[i].chain);
2754 spin_lock_init(&futex_queues[i].lock); 2879 spin_lock_init(&futex_queues[i].lock);
2755 } 2880 }