aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/futex.c
diff options
context:
space:
mode:
authorDavidlohr Bueso <davidlohr@hp.com>2014-01-12 18:31:25 -0500
committerIngo Molnar <mingo@kernel.org>2014-01-13 05:45:21 -0500
commitb0c29f79ecea0b6fbcefc999e70f2843ae8306db (patch)
tree94e0d3d66c9b10d76cb0e635a352cae81d622530 /kernel/futex.c
parent99b60ce69734dfeda58c6184a326b9475ce1dba3 (diff)
futexes: Avoid taking the hb->lock if there's nothing to wake up
In futex_wake() there is clearly no point in taking the hb->lock if we know beforehand that there are no tasks to be woken. While the hash bucket's plist head is a cheap way of knowing this, we cannot rely 100% on it as there is a racy window between the futex_wait call and when the task is actually added to the plist. To this end, we couple it with the spinlock check as tasks trying to enter the critical region are most likely potential waiters that will be added to the plist, thus preventing tasks sleeping forever if wakers don't acknowledge all possible waiters. Furthermore, the futex ordering guarantees are preserved, ensuring that waiters either observe the changed user space value before blocking or are woken by a concurrent waker. For wakers, this is done by relying on the barriers in get_futex_key_refs() -- for archs that do not have implicit mb in atomic_inc(), we explicitly add them through a new futex_get_mm function. For waiters we rely on the fact that spin_lock calls already update the head counter, so spinners are visible even if the lock hasn't been acquired yet. For more details please refer to the updated comments in the code and related discussion: https://lkml.org/lkml/2013/11/26/556 Special thanks to tglx for careful review and feedback. Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Reviewed-by: Darren Hart <dvhart@linux.intel.com> Reviewed-by: Thomas Gleixner <tglx@linutronix.de> Reviewed-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Davidlohr Bueso <davidlohr@hp.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Jeff Mahoney <jeffm@suse.com> Cc: Scott Norton <scott.norton@hp.com> Cc: Tom Vaden <tom.vaden@hp.com> Cc: Aswin Chandramouleeswaran <aswin@hp.com> Cc: Waiman Long <Waiman.Long@hp.com> Cc: Jason Low <jason.low2@hp.com> Cc: Andrew Morton <akpm@linux-foundation.org> Link: http://lkml.kernel.org/r/1389569486-25487-5-git-send-email-davidlohr@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/futex.c')
-rw-r--r-- kernel/futex.c | 117
1 file changed, 92 insertions, 25 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index fcc6850483fb..30971b5c0e2d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -75,17 +75,20 @@
75 * The waiter reads the futex value in user space and calls 75 * The waiter reads the futex value in user space and calls
76 * futex_wait(). This function computes the hash bucket and acquires 76 * futex_wait(). This function computes the hash bucket and acquires
77 * the hash bucket lock. After that it reads the futex user space value 77 * the hash bucket lock. After that it reads the futex user space value
78 * again and verifies that the data has not changed. If it has not 78 * again and verifies that the data has not changed. If it has not changed
79 * changed it enqueues itself into the hash bucket, releases the hash 79 * it enqueues itself into the hash bucket, releases the hash bucket lock
80 * bucket lock and schedules. 80 * and schedules.
81 * 81 *
82 * The waker side modifies the user space value of the futex and calls 82 * The waker side modifies the user space value of the futex and calls
83 * futex_wake(). This functions computes the hash bucket and acquires 83 * futex_wake(). This function computes the hash bucket and acquires the
84 * the hash bucket lock. Then it looks for waiters on that futex in the 84 * hash bucket lock. Then it looks for waiters on that futex in the hash
85 * hash bucket and wakes them. 85 * bucket and wakes them.
86 * 86 *
87 * Note that the spin_lock serializes waiters and wakers, so that the 87 * In futex wake up scenarios where no tasks are blocked on a futex, taking
88 * following scenario is avoided: 88 * the hb spinlock can be avoided and simply return. In order for this
89 * optimization to work, ordering guarantees must exist so that the waiter
90 * being added to the list is acknowledged when the list is concurrently being
91 * checked by the waker, avoiding scenarios like the following:
89 * 92 *
90 * CPU 0 CPU 1 93 * CPU 0 CPU 1
91 * val = *futex; 94 * val = *futex;
@@ -106,24 +109,52 @@
106 * This would cause the waiter on CPU 0 to wait forever because it 109 * This would cause the waiter on CPU 0 to wait forever because it
107 * missed the transition of the user space value from val to newval 110 * missed the transition of the user space value from val to newval
108 * and the waker did not find the waiter in the hash bucket queue. 111 * and the waker did not find the waiter in the hash bucket queue.
109 * The spinlock serializes that:
110 * 112 *
111 * CPU 0 CPU 1 113 * The correct serialization ensures that a waiter either observes
114 * the changed user space value before blocking or is woken by a
115 * concurrent waker:
116 *
117 * CPU 0 CPU 1
112 * val = *futex; 118 * val = *futex;
113 * sys_futex(WAIT, futex, val); 119 * sys_futex(WAIT, futex, val);
114 * futex_wait(futex, val); 120 * futex_wait(futex, val);
115 * lock(hash_bucket(futex)); 121 *
116 * uval = *futex; 122 * waiters++;
117 * *futex = newval; 123 * mb(); (A) <-- paired with -.
118 * sys_futex(WAKE, futex); 124 * |
119 * futex_wake(futex); 125 * lock(hash_bucket(futex)); |
120 * lock(hash_bucket(futex)); 126 * |
127 * uval = *futex; |
128 * | *futex = newval;
129 * | sys_futex(WAKE, futex);
130 * | futex_wake(futex);
131 * |
132 * `-------> mb(); (B)
121 * if (uval == val) 133 * if (uval == val)
122 * queue(); 134 * queue();
123 * unlock(hash_bucket(futex)); 135 * unlock(hash_bucket(futex));
124 * schedule(); if (!queue_empty()) 136 * schedule(); if (waiters)
125 * wake_waiters(futex); 137 * lock(hash_bucket(futex));
126 * unlock(hash_bucket(futex)); 138 * wake_waiters(futex);
139 * unlock(hash_bucket(futex));
140 *
141 * Where (A) orders the waiters increment and the futex value read -- this
142 * is guaranteed by the head counter in the hb spinlock; and where (B)
143 * orders the write to futex and the waiters read -- this is done by the
144 * barriers in get_futex_key_refs(), through either ihold or atomic_inc,
145 * depending on the futex type.
146 *
147 * This yields the following case (where X:=waiters, Y:=futex):
148 *
149 * X = Y = 0
150 *
151 * w[X]=1 w[Y]=1
152 * MB MB
153 * r[Y]=y r[X]=x
154 *
155 * Which guarantees that x==0 && y==0 is impossible; which translates back into
156 * the guarantee that we cannot both miss the futex variable change and the
157 * enqueue.
127 */ 158 */
128 159
129int __read_mostly futex_cmpxchg_enabled; 160int __read_mostly futex_cmpxchg_enabled;
@@ -211,6 +242,36 @@ static unsigned long __read_mostly futex_hashsize;
211 242
212static struct futex_hash_bucket *futex_queues; 243static struct futex_hash_bucket *futex_queues;
213 244
245static inline void futex_get_mm(union futex_key *key)
246{
247 atomic_inc(&key->private.mm->mm_count);
248 /*
249 * Ensure futex_get_mm() implies a full barrier such that
250 * get_futex_key() implies a full barrier. This is relied upon
251 * as full barrier (B), see the ordering comment above.
252 */
253 smp_mb__after_atomic_inc();
254}
255
256static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
257{
258#ifdef CONFIG_SMP
259 /*
260 * Tasks trying to enter the critical region are most likely
261 * potential waiters that will be added to the plist. Ensure
262 * that wakers won't miss to-be-slept tasks in the window between
263 * the wait call and the actual plist_add.
264 */
265 if (spin_is_locked(&hb->lock))
266 return true;
267 smp_rmb(); /* Make sure we check the lock state first */
268
269 return !plist_head_empty(&hb->chain);
270#else
271 return true;
272#endif
273}
274
214/* 275/*
215 * We hash on the keys returned from get_futex_key (see below). 276 * We hash on the keys returned from get_futex_key (see below).
216 */ 277 */
@@ -245,10 +306,10 @@ static void get_futex_key_refs(union futex_key *key)
245 306
246 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { 307 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
247 case FUT_OFF_INODE: 308 case FUT_OFF_INODE:
248 ihold(key->shared.inode); 309 ihold(key->shared.inode); /* implies MB (B) */
249 break; 310 break;
250 case FUT_OFF_MMSHARED: 311 case FUT_OFF_MMSHARED:
251 atomic_inc(&key->private.mm->mm_count); 312 futex_get_mm(key); /* implies MB (B) */
252 break; 313 break;
253 } 314 }
254} 315}
@@ -322,7 +383,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
322 if (!fshared) { 383 if (!fshared) {
323 key->private.mm = mm; 384 key->private.mm = mm;
324 key->private.address = address; 385 key->private.address = address;
325 get_futex_key_refs(key); 386 get_futex_key_refs(key); /* implies MB (B) */
326 return 0; 387 return 0;
327 } 388 }
328 389
@@ -429,7 +490,7 @@ again:
429 key->shared.pgoff = basepage_index(page); 490 key->shared.pgoff = basepage_index(page);
430 } 491 }
431 492
432 get_futex_key_refs(key); 493 get_futex_key_refs(key); /* implies MB (B) */
433 494
434out: 495out:
435 unlock_page(page_head); 496 unlock_page(page_head);
@@ -1052,6 +1113,11 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1052 goto out; 1113 goto out;
1053 1114
1054 hb = hash_futex(&key); 1115 hb = hash_futex(&key);
1116
1117 /* Make sure we really have tasks to wakeup */
1118 if (!hb_waiters_pending(hb))
1119 goto out_put_key;
1120
1055 spin_lock(&hb->lock); 1121 spin_lock(&hb->lock);
1056 1122
1057 plist_for_each_entry_safe(this, next, &hb->chain, list) { 1123 plist_for_each_entry_safe(this, next, &hb->chain, list) {
@@ -1072,6 +1138,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1072 } 1138 }
1073 1139
1074 spin_unlock(&hb->lock); 1140 spin_unlock(&hb->lock);
1141out_put_key:
1075 put_futex_key(&key); 1142 put_futex_key(&key);
1076out: 1143out:
1077 return ret; 1144 return ret;
@@ -1535,7 +1602,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1535 hb = hash_futex(&q->key); 1602 hb = hash_futex(&q->key);
1536 q->lock_ptr = &hb->lock; 1603 q->lock_ptr = &hb->lock;
1537 1604
1538 spin_lock(&hb->lock); 1605 spin_lock(&hb->lock); /* implies MB (A) */
1539 return hb; 1606 return hb;
1540} 1607}
1541 1608