Diffstat (limited to 'kernel/futex.c')

 -rw-r--r--   kernel/futex.c | 117
 1 file changed, 92 insertions(+), 25 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index fcc6850483fb..30971b5c0e2d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -75,17 +75,20 @@
  * The waiter reads the futex value in user space and calls
  * futex_wait(). This function computes the hash bucket and acquires
  * the hash bucket lock. After that it reads the futex user space value
- * again and verifies that the data has not changed. If it has not
- * changed it enqueues itself into the hash bucket, releases the hash
- * bucket lock and schedules.
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
  *
  * The waker side modifies the user space value of the futex and calls
- * futex_wake(). This functions computes the hash bucket and acquires
- * the hash bucket lock. Then it looks for waiters on that futex in the
- * hash bucket and wakes them.
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
  *
- * Note that the spin_lock serializes waiters and wakers, so that the
- * following scenario is avoided:
+ * In futex wake up scenarios where no tasks are blocked on a futex, taking
+ * the hb spinlock can be avoided and simply return. In order for this
+ * optimization to work, ordering guarantees must exist so that the waiter
+ * being added to the list is acknowledged when the list is concurrently being
+ * checked by the waker, avoiding scenarios like the following:
  *
  * CPU 0                               CPU 1
  * val = *futex;
@@ -106,24 +109,52 @@
  * This would cause the waiter on CPU 0 to wait forever because it
  * missed the transition of the user space value from val to newval
  * and the waker did not find the waiter in the hash bucket queue.
- * The spinlock serializes that:
  *
- * CPU 0                               CPU 1
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0                               CPU 1
  * val = *futex;
  * sys_futex(WAIT, futex, val);
  *   futex_wait(futex, val);
- *   lock(hash_bucket(futex));
- *   uval = *futex;
- *                                     *futex = newval;
- *                                     sys_futex(WAKE, futex);
- *                                       futex_wake(futex);
- *                                       lock(hash_bucket(futex));
+ *
+ *   waiters++;
+ *   mb(); (A) <-- paired with -.
+ *                              |
+ *   lock(hash_bucket(futex));  |
+ *                              |
+ *   uval = *futex;             |
+ *                              |        *futex = newval;
+ *                              |        sys_futex(WAKE, futex);
+ *                              |          futex_wake(futex);
+ *                              |
+ *                              `------->  mb(); (B)
  *   if (uval == val)
  *     queue();
  *     unlock(hash_bucket(futex));
- *     schedule();                         if (!queue_empty())
- *                                           wake_waiters(futex);
- *                                         unlock(hash_bucket(futex));
+ *     schedule();                         if (waiters)
+ *                                           lock(hash_bucket(futex));
+ *                                           wake_waiters(futex);
+ *                                           unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read -- this
+ * is guaranteed by the head counter in the hb spinlock; and where (B)
+ * orders the write to futex and the waiters read -- this is done by the
+ * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
+ * depending on the futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ *      X = Y = 0
+ *
+ *      w[X]=1          w[Y]=1
+ *      MB              MB
+ *      r[Y]=y          r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
  */
 
 int __read_mostly futex_cmpxchg_enabled;
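The w[X]/w[Y] diagram above is the classic store-buffering litmus test. As a sanity check of the argument, here is a minimal userspace sketch of it in C11 atomics and pthreads; the thread roles, names and loop count are illustrative only and not part of this patch. With both full fences in place, the outcome x == 0 && y == 0, i.e. the waiter missing the value change while the waker simultaneously misses the enqueue, is impossible:

    /* sb_litmus.c: store-buffering litmus test from the comment above. */
    #include <assert.h>
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int X, Y;        /* X := waiters, Y := futex value */
    static int x, y;               /* what each side observed */

    static void *waiter(void *arg)
    {
            (void)arg;
            atomic_store_explicit(&X, 1, memory_order_relaxed);   /* w[X]=1 */
            atomic_thread_fence(memory_order_seq_cst);            /* MB (A) */
            y = atomic_load_explicit(&Y, memory_order_relaxed);   /* r[Y]=y */
            return NULL;
    }

    static void *waker(void *arg)
    {
            (void)arg;
            atomic_store_explicit(&Y, 1, memory_order_relaxed);   /* w[Y]=1 */
            atomic_thread_fence(memory_order_seq_cst);            /* MB (B) */
            x = atomic_load_explicit(&X, memory_order_relaxed);   /* r[X]=x */
            return NULL;
    }

    int main(void)
    {
            for (int i = 0; i < 100000; i++) {
                    pthread_t a, b;

                    atomic_store(&X, 0);
                    atomic_store(&Y, 0);
                    pthread_create(&a, NULL, waiter, NULL);
                    pthread_create(&b, NULL, waker, NULL);
                    pthread_join(a, NULL);
                    pthread_join(b, NULL);
                    /* Both sides missing each other is forbidden. */
                    assert(!(x == 0 && y == 0));
            }
            puts("x == 0 && y == 0 never observed");
            return 0;
    }

Compile with cc -pthread. Removing either fence permits the store-load reordering that x86 and weaker architectures allow, and the assertion can then fire.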
@@ -211,6 +242,36 @@ static unsigned long __read_mostly futex_hashsize;
 
 static struct futex_hash_bucket *futex_queues;
 
+static inline void futex_get_mm(union futex_key *key)
+{
+        atomic_inc(&key->private.mm->mm_count);
+        /*
+         * Ensure futex_get_mm() implies a full barrier such that
+         * get_futex_key() implies a full barrier. This is relied upon
+         * as full barrier (B), see the ordering comment above.
+         */
+        smp_mb__after_atomic_inc();
+}
+
+static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+        /*
+         * Tasks trying to enter the critical region are most likely
+         * potential waiters that will be added to the plist. Ensure
+         * that wakers won't miss to-be-slept tasks in the window between
+         * the wait call and the actual plist_add.
+         */
+        if (spin_is_locked(&hb->lock))
+                return true;
+        smp_rmb(); /* Make sure we check the lock state first */
+
+        return !plist_head_empty(&hb->chain);
+#else
+        return true;
+#endif
+}
+
 /*
  * We hash on the keys returned from get_futex_key (see below).
  */
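hb_waiters_pending() is the waker's half of the handshake: a held hb->lock means some task may be just about to add itself to the plist, so the waker must assume waiters exist; only when the lock is free is the emptiness check on hb->chain meaningful, and the smp_rmb() keeps those two reads in that order. The same shape works in ordinary userspace code. Below is a hedged analogue, assuming a hypothetical event object built from a condvar plus an atomic waiter count; names such as event_signal() are invented for illustration:

    #include <pthread.h>
    #include <stdatomic.h>

    /* Hypothetical event object; the fields mirror the futex pieces. */
    struct event {
            pthread_mutex_t lock;     /* plays the role of hb->lock */
            pthread_cond_t cond;
            atomic_int waiters;       /* plays the role of the hb plist */
            atomic_int signaled;      /* plays the role of the futex value */
    };

    void event_wait(struct event *ev)
    {
            atomic_fetch_add(&ev->waiters, 1);     /* waiters++; full MB (A) */
            pthread_mutex_lock(&ev->lock);
            while (!atomic_load(&ev->signaled))    /* re-check, as uval == val */
                    pthread_cond_wait(&ev->cond, &ev->lock);
            pthread_mutex_unlock(&ev->lock);
            atomic_fetch_sub(&ev->waiters, 1);
    }

    void event_signal(struct event *ev)
    {
            atomic_store(&ev->signaled, 1);        /* *futex = newval; MB (B) */
            if (atomic_load(&ev->waiters) == 0)
                    return;                        /* fast path: skip the lock */
            pthread_mutex_lock(&ev->lock);
            pthread_cond_broadcast(&ev->cond);
            pthread_mutex_unlock(&ev->lock);
    }

By the store-buffering argument above, event_signal() can read waiters == 0 only if the racing waiter is guaranteed to read signaled == 1 and never block, so skipping the lock is safe.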
@@ -245,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key)
 
         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
         case FUT_OFF_INODE:
-                ihold(key->shared.inode);
+                ihold(key->shared.inode); /* implies MB (B) */
                 break;
         case FUT_OFF_MMSHARED:
-                atomic_inc(&key->private.mm->mm_count);
+                futex_get_mm(key); /* implies MB (B) */
                 break;
         }
 }
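A note on barrier (B): in the kernel memory model a bare atomic_inc() makes no ordering promise, which is why futex_get_mm() pairs it with smp_mb__after_atomic_inc() (a no-op on architectures such as x86, where a locked RMW instruction already acts as a full barrier). A C11 rendering of that idiom, with the hypothetical name get_ref_mb(), could look like:

    #include <stdatomic.h>

    static atomic_int refcount;

    static inline void get_ref_mb(void)
    {
            /* The increment itself may be reordered freely... */
            atomic_fetch_add_explicit(&refcount, 1, memory_order_relaxed);
            /* ...so a full fence follows it; this is barrier (B). */
            atomic_thread_fence(memory_order_seq_cst);
    }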
@@ -322,7 +383,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
         if (!fshared) {
                 key->private.mm = mm;
                 key->private.address = address;
-                get_futex_key_refs(key);
+                get_futex_key_refs(key); /* implies MB (B) */
                 return 0;
         }
 
@@ -429,7 +490,7 @@ again:
                 key->shared.pgoff = basepage_index(page);
         }
 
-        get_futex_key_refs(key);
+        get_futex_key_refs(key); /* implies MB (B) */
 
 out:
         unlock_page(page_head);
@@ -1052,6 +1113,11 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
                 goto out;
 
         hb = hash_futex(&key);
+
+        /* Make sure we really have tasks to wakeup */
+        if (!hb_waiters_pending(hb))
+                goto out_put_key;
+
         spin_lock(&hb->lock);
 
         plist_for_each_entry_safe(this, next, &hb->chain, list) {
@@ -1072,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
         }
 
         spin_unlock(&hb->lock);
+out_put_key:
         put_futex_key(&key);
 out:
         return ret;
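To see why FUTEX_WAKE calls with nobody queued are common enough to be worth this fast path, consider the standard userspace mutex scheme from Drepper's "Futexes Are Tricky", sketched below (this is the well-known userspace pattern, not code from this patch): unlock issues FUTEX_WAKE whenever the lock word was marked contended, yet the waiter that marked it may already have been woken, or may never have slept because its FUTEX_WAIT raced with the value change. With hb_waiters_pending() in place, such a wake returns without ever touching hb->lock:

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <stdatomic.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* 0 = unlocked, 1 = locked, 2 = locked with (possible) waiters */
    static atomic_int lock_word;

    static void futex_lock(void)
    {
            int c = 0;
            if (atomic_compare_exchange_strong(&lock_word, &c, 1))
                    return;                 /* uncontended fast path */
            if (c != 2)
                    c = atomic_exchange(&lock_word, 2);
            while (c != 0) {
                    /* Sleeps only while the word still reads 2. */
                    syscall(SYS_futex, &lock_word, FUTEX_WAIT, 2, NULL, NULL, 0);
                    c = atomic_exchange(&lock_word, 2);
            }
    }

    static void futex_unlock(void)
    {
            if (atomic_exchange(&lock_word, 0) == 2) {
                    /*
                     * The waiter that set the word to 2 may be long gone;
                     * futex_wake() can now detect that via
                     * hb_waiters_pending() and return without hb->lock.
                     */
                    syscall(SYS_futex, &lock_word, FUTEX_WAKE, 1, NULL, NULL, 0);
            }
    }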
@@ -1535,7 +1602,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
         hb = hash_futex(&q->key);
         q->lock_ptr = &hb->lock;
 
-        spin_lock(&hb->lock);
+        spin_lock(&hb->lock); /* implies MB (A) */
         return hb;
 }
 
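A final note on why spin_lock() can be annotated as implying MB (A): with the ticket spinlocks of this era, acquiring the lock bumps the ticket counter with an RMW that is a full barrier on x86 (a locked xadd), which is exactly what the "head counter" remark in the comment block relies on, and spin_is_locked(), as used by hb_waiters_pending(), reads those same counters back. A hypothetical userspace ticket lock shows the shape; none of these names come from the kernel:

    #include <stdatomic.h>
    #include <stdbool.h>

    struct ticket_lock {
            atomic_uint next;      /* ticket handed to the next arrival */
            atomic_uint owner;     /* ticket currently being served */
    };

    static void ticket_lock_acquire(struct ticket_lock *l)
    {
            /* Full-barrier RMW: the "waiters++; MB (A)" stand-in. */
            unsigned int me = atomic_fetch_add(&l->next, 1);
            while (atomic_load(&l->owner) != me)
                    ;              /* spin until our ticket comes up */
    }

    static void ticket_lock_release(struct ticket_lock *l)
    {
            atomic_fetch_add(&l->owner, 1);
    }

    static bool ticket_is_locked(struct ticket_lock *l)
    {
            /* Held, or mid-acquisition, whenever the counters differ. */
            return atomic_load(&l->next) != atomic_load(&l->owner);
    }

The fetch_add in ticket_lock_acquire() plays the role of "waiters++; mb();": any waker that runs ticket_is_locked() afterwards sees the counters differ and falls back to the slow path.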