Diffstat (limited to 'kernel/futex.c')
 kernel/futex.c | 117 ++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 92 insertions(+), 25 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index fcc6850483fb..30971b5c0e2d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -75,17 +75,20 @@
  * The waiter reads the futex value in user space and calls
  * futex_wait(). This function computes the hash bucket and acquires
  * the hash bucket lock. After that it reads the futex user space value
- * again and verifies that the data has not changed. If it has not
- * changed it enqueues itself into the hash bucket, releases the hash
- * bucket lock and schedules.
+ * again and verifies that the data has not changed. If it has not changed
+ * it enqueues itself into the hash bucket, releases the hash bucket lock
+ * and schedules.
  *
  * The waker side modifies the user space value of the futex and calls
- * futex_wake(). This functions computes the hash bucket and acquires
- * the hash bucket lock. Then it looks for waiters on that futex in the
- * hash bucket and wakes them.
+ * futex_wake(). This function computes the hash bucket and acquires the
+ * hash bucket lock. Then it looks for waiters on that futex in the hash
+ * bucket and wakes them.
  *
- * Note that the spin_lock serializes waiters and wakers, so that the
- * following scenario is avoided:
+ * In futex wake up scenarios where no tasks are blocked on a futex, the
+ * hb spinlock can be skipped entirely. For this optimization to work,
+ * ordering guarantees must exist so that the waiter being added to the
+ * list is acknowledged when the list is concurrently checked by the
+ * waker, avoiding scenarios like the following:
  *
  * CPU 0                               CPU 1
  * val = *futex;
@@ -106,24 +109,52 @@
  * This would cause the waiter on CPU 0 to wait forever because it
  * missed the transition of the user space value from val to newval
  * and the waker did not find the waiter in the hash bucket queue.
- * The spinlock serializes that:
  *
- * CPU 0                               CPU 1
+ * The correct serialization ensures that a waiter either observes
+ * the changed user space value before blocking or is woken by a
+ * concurrent waker:
+ *
+ * CPU 0                               CPU 1
  * val = *futex;
  * sys_futex(WAIT, futex, val);
  *   futex_wait(futex, val);
- *   lock(hash_bucket(futex));
- *   uval = *futex;
- *                                     *futex = newval;
- *                                     sys_futex(WAKE, futex);
- *                                       futex_wake(futex);
- *                                       lock(hash_bucket(futex));
+ *
+ *   waiters++;
+ *   mb(); (A) <-- paired with -.
+ *                              |
+ *   lock(hash_bucket(futex));  |
+ *                              |
+ *   uval = *futex;             |
+ *                              |        *futex = newval;
+ *                              |        sys_futex(WAKE, futex);
+ *                              |          futex_wake(futex);
+ *                              |
+ *                              `------->  mb(); (B)
  *   if (uval == val)
  *     queue();
  *   unlock(hash_bucket(futex));
- *   schedule();                         if (!queue_empty())
- *                                         wake_waiters(futex);
- *                                       unlock(hash_bucket(futex));
+ *   schedule();                         if (waiters)
+ *                                           lock(hash_bucket(futex));
+ *                                           wake_waiters(futex);
+ *                                           unlock(hash_bucket(futex));
+ *
+ * Where (A) orders the waiters increment and the futex value read -- this
+ * is guaranteed by the head counter in the hb spinlock; and where (B)
+ * orders the write to futex and the waiters read -- this is done by the
+ * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
+ * depending on the futex type.
+ *
+ * This yields the following case (where X:=waiters, Y:=futex):
+ *
+ *     X = Y = 0
+ *
+ *     w[X]=1          w[Y]=1
+ *     MB              MB
+ *     r[Y]=y          r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible; which translates back into
+ * the guarantee that we cannot both miss the futex variable change and the
+ * enqueue.
  */
 
 int __read_mostly futex_cmpxchg_enabled;
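The w[X]/w[Y] diagram above is the classic store-buffering litmus test. The same guarantee can be reproduced in user space with C11 atomics, with atomic_thread_fence(memory_order_seq_cst) standing in for the kernel's mb(); the program below is an illustrative sketch only, and all names in it are invented:

/*
 * Store-buffering sketch of barriers (A) and (B): with a full
 * fence between each store and the following load, the outcome
 * x == 0 && y == 0 is forbidden, just as the pairing of (A) and
 * (B) forbids missing both the futex change and the enqueue.
 *
 * Build with: cc -std=c11 -pthread sb.c
 */
#include <stdatomic.h>
#include <pthread.h>
#include <assert.h>

static atomic_int X;    /* plays the role of "waiters" */
static atomic_int Y;    /* plays the role of "futex"   */
static int x, y;

static void *waiter_side(void *arg)
{
        atomic_store_explicit(&X, 1, memory_order_relaxed);  /* w[X]=1 */
        atomic_thread_fence(memory_order_seq_cst);           /* MB (A) */
        y = atomic_load_explicit(&Y, memory_order_relaxed);  /* r[Y]=y */
        return NULL;
}

static void *waker_side(void *arg)
{
        atomic_store_explicit(&Y, 1, memory_order_relaxed);  /* w[Y]=1 */
        atomic_thread_fence(memory_order_seq_cst);           /* MB (B) */
        x = atomic_load_explicit(&X, memory_order_relaxed);  /* r[X]=x */
        return NULL;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, waiter_side, NULL);
        pthread_create(&t2, NULL, waker_side, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);

        assert(!(x == 0 && y == 0));    /* the forbidden outcome */
        return 0;
}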
@@ -211,6 +242,36 @@ static unsigned long __read_mostly futex_hashsize;
 
 static struct futex_hash_bucket *futex_queues;
 
+static inline void futex_get_mm(union futex_key *key)
+{
+        atomic_inc(&key->private.mm->mm_count);
+        /*
+         * Ensure futex_get_mm() implies a full barrier such that
+         * get_futex_key() implies a full barrier. This is relied upon
+         * as full barrier (B), see the ordering comment above.
+         */
+        smp_mb__after_atomic_inc();
+}
+
+static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+        /*
+         * Tasks trying to enter the critical region are most likely
+         * potential waiters that will be added to the plist. Ensure
+         * that wakers won't miss to-be-slept tasks in the window between
+         * the wait call and the actual plist_add.
+         */
+        if (spin_is_locked(&hb->lock))
+                return true;
+        smp_rmb(); /* Make sure we check the lock state first */
+
+        return !plist_head_empty(&hb->chain);
+#else
+        return true;
+#endif
+}
+
 /*
  * We hash on the keys returned from get_futex_key (see below).
  */
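hb_waiters_pending() leans on the hb spinlock itself: a held lock means some task is already between futex_wait() and plist_add(), so the waker must not take the fast path, and the smp_rmb() keeps the lock-state check ordered before the plist check. User space has no spin_is_locked() equivalent, so a rough analogue of the same fast path can be sketched with an explicit waiter counter; the bucket, nwaiters and wake_bucket names below are invented for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <pthread.h>

struct bucket {
        pthread_mutex_t lock;   /* stands in for hb->lock */
        atomic_int nwaiters;    /* stands in for the lock/plist checks */
        /* ... queue of blocked waiters ... */
};

static void wait_on_bucket(struct bucket *b)
{
        atomic_fetch_add_explicit(&b->nwaiters, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);      /* MB (A) analogue */
        pthread_mutex_lock(&b->lock);
        /* ... recheck the user-space value, enqueue, block ... */
        pthread_mutex_unlock(&b->lock);
        atomic_fetch_sub_explicit(&b->nwaiters, 1, memory_order_relaxed);
}

static void wake_bucket(struct bucket *b)
{
        /* The caller has already written the new value; this fence
         * orders that write before the nwaiters read, as (B) does. */
        atomic_thread_fence(memory_order_seq_cst);      /* MB (B) analogue */
        if (atomic_load_explicit(&b->nwaiters, memory_order_relaxed) == 0)
                return;         /* fast path: nobody to wake, skip the lock */
        pthread_mutex_lock(&b->lock);
        /* ... wake the queued waiters ... */
        pthread_mutex_unlock(&b->lock);
}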
@@ -245,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key)
 
         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
         case FUT_OFF_INODE:
-                ihold(key->shared.inode);
+                ihold(key->shared.inode); /* implies MB (B) */
                 break;
         case FUT_OFF_MMSHARED:
-                atomic_inc(&key->private.mm->mm_count);
+                futex_get_mm(key); /* implies MB (B) */
                 break;
         }
 }
@@ -322,7 +383,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
         if (!fshared) {
                 key->private.mm = mm;
                 key->private.address = address;
-                get_futex_key_refs(key);
+                get_futex_key_refs(key); /* implies MB (B) */
                 return 0;
         }
 
@@ -429,7 +490,7 @@ again:
                 key->shared.pgoff = basepage_index(page);
         }
 
-        get_futex_key_refs(key);
+        get_futex_key_refs(key); /* implies MB (B) */
 
 out:
         unlock_page(page_head);
@@ -1052,6 +1113,11 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
                 goto out;
 
         hb = hash_futex(&key);
+
+        /* Make sure we really have tasks to wake up */
+        if (!hb_waiters_pending(hb))
+                goto out_put_key;
+
         spin_lock(&hb->lock);
 
         plist_for_each_entry_safe(this, next, &hb->chain, list) {
@@ -1072,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
         }
 
         spin_unlock(&hb->lock);
+out_put_key:
         put_futex_key(&key);
 out:
         return ret;
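For reference, user-space callers reach futex_wait() and futex_wake() through the raw futex(2) syscall. A minimal sketch of such a wait/wake pair (wrapper names invented, error handling elided):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static long futex(uint32_t *uaddr, int op, uint32_t val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void wait_on_word(uint32_t *word, uint32_t expected)
{
        /* Blocks only while *word == expected; the kernel rechecks
         * the value under hb->lock, closing the lost-wakeup race
         * described in the comment block above. */
        futex(word, FUTEX_WAIT, expected);
}

static void wake_one(uint32_t *word)
{
        /* With hb_waiters_pending() in place, this returns without
         * touching hb->lock when no tasks are queued on the word. */
        futex(word, FUTEX_WAKE, 1);
}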
@@ -1535,7 +1602,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
         hb = hash_futex(&q->key);
         q->lock_ptr = &hb->lock;
 
-        spin_lock(&hb->lock);
+        spin_lock(&hb->lock); /* implies MB (A) */
         return hb;
 }
 