aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJohannes Weiner <hannes@cmpxchg.org>2009-02-04 18:12:14 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2009-02-05 15:56:48 -0500
commit777c6c5f1f6e757ae49ecca2ed72d6b1f523c007 (patch)
tree342b79faee43af9705b5a8ca406565fda0ad08fd
parent40b0bb1e734700b81d2ec69367c035cd1537f4fa (diff)
wait: prevent exclusive waiter starvation
With exclusive waiters, every process woken up through the wait queue must ensure that the next waiter down the line is woken when it has finished. Interruptible waiters don't do that when aborting due to a signal. And if an aborting waiter is concurrently woken up through the waitqueue, noone will ever wake up the next waiter. This has been observed with __wait_on_bit_lock() used by lock_page_killable(): the first contender on the queue was aborting when the actual lock holder woke it up concurrently. The aborted contender didn't acquire the lock and therefor never did an unlock followed by waking up the next waiter. Add abort_exclusive_wait() which removes the process' wait descriptor from the waitqueue, iff still queued, or wakes up the next waiter otherwise. It does so under the waitqueue lock. Racing with a wake up means the aborting process is either already woken (removed from the queue) and will wake up the next waiter, or it will remove itself from the queue and the concurrent wake up will apply to the next waiter after it. Use abort_exclusive_wait() in __wait_event_interruptible_exclusive() and __wait_on_bit_lock() when they were interrupted by other means than a wake up through the queue. [akpm@linux-foundation.org: coding-style fixes] Reported-by: Chris Mason <chris.mason@oracle.com> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Mentored-by: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Matthew Wilcox <matthew@wil.cx> Cc: Chuck Lever <cel@citi.umich.edu> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Ingo Molnar <mingo@elte.hu> Cc: <stable@kernel.org> ["after some testing"] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/wait.h11
-rw-r--r--kernel/sched.c4
-rw-r--r--kernel/wait.c59
3 files changed, 63 insertions, 11 deletions
diff --git a/include/linux/wait.h b/include/linux/wait.h
index ef609f842fac..a210ede73b56 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -132,6 +132,8 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
132 list_del(&old->task_list); 132 list_del(&old->task_list);
133} 133}
134 134
135void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
136 int nr_exclusive, int sync, void *key);
135void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); 137void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
136extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); 138extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
137extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); 139extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
@@ -333,16 +335,19 @@ do { \
333 for (;;) { \ 335 for (;;) { \
334 prepare_to_wait_exclusive(&wq, &__wait, \ 336 prepare_to_wait_exclusive(&wq, &__wait, \
335 TASK_INTERRUPTIBLE); \ 337 TASK_INTERRUPTIBLE); \
336 if (condition) \ 338 if (condition) { \
339 finish_wait(&wq, &__wait); \
337 break; \ 340 break; \
341 } \
338 if (!signal_pending(current)) { \ 342 if (!signal_pending(current)) { \
339 schedule(); \ 343 schedule(); \
340 continue; \ 344 continue; \
341 } \ 345 } \
342 ret = -ERESTARTSYS; \ 346 ret = -ERESTARTSYS; \
347 abort_exclusive_wait(&wq, &__wait, \
348 TASK_INTERRUPTIBLE, NULL); \
343 break; \ 349 break; \
344 } \ 350 } \
345 finish_wait(&wq, &__wait); \
346} while (0) 351} while (0)
347 352
348#define wait_event_interruptible_exclusive(wq, condition) \ 353#define wait_event_interruptible_exclusive(wq, condition) \
@@ -431,6 +436,8 @@ extern long interruptible_sleep_on_timeout(wait_queue_head_t *q,
431void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); 436void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
432void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); 437void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
433void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); 438void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
439void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
440 unsigned int mode, void *key);
434int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 441int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
435int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 442int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
436 443
diff --git a/kernel/sched.c b/kernel/sched.c
index 242d0d47a70d..8ee437a5ec1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4697,8 +4697,8 @@ EXPORT_SYMBOL(default_wake_function);
4697 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4697 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
4698 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4698 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4699 */ 4699 */
4700static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4700void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4701 int nr_exclusive, int sync, void *key) 4701 int nr_exclusive, int sync, void *key)
4702{ 4702{
4703 wait_queue_t *curr, *next; 4703 wait_queue_t *curr, *next;
4704 4704
diff --git a/kernel/wait.c b/kernel/wait.c
index cd87131f2fc2..42a2dbc181c8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91} 91}
92EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
93 93
94/*
95 * finish_wait - clean up after waiting in a queue
96 * @q: waitqueue waited on
97 * @wait: wait descriptor
98 *
99 * Sets current thread back to running state and removes
100 * the wait descriptor from the given waitqueue if still
101 * queued.
102 */
94void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) 103void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
95{ 104{
96 unsigned long flags; 105 unsigned long flags;
@@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
117} 126}
118EXPORT_SYMBOL(finish_wait); 127EXPORT_SYMBOL(finish_wait);
119 128
129/*
130 * abort_exclusive_wait - abort exclusive waiting in a queue
131 * @q: waitqueue waited on
132 * @wait: wait descriptor
133 * @state: runstate of the waiter to be woken
134 * @key: key to identify a wait bit queue or %NULL
135 *
136 * Sets current thread back to running state and removes
137 * the wait descriptor from the given waitqueue if still
138 * queued.
139 *
140 * Wakes up the next waiter if the caller is concurrently
141 * woken up through the queue.
142 *
143 * This prevents waiter starvation where an exclusive waiter
144 * aborts and is woken up concurrently and noone wakes up
145 * the next waiter.
146 */
147void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
148 unsigned int mode, void *key)
149{
150 unsigned long flags;
151
152 __set_current_state(TASK_RUNNING);
153 spin_lock_irqsave(&q->lock, flags);
154 if (!list_empty(&wait->task_list))
155 list_del_init(&wait->task_list);
156 else if (waitqueue_active(q))
157 __wake_up_common(q, mode, 1, 0, key);
158 spin_unlock_irqrestore(&q->lock, flags);
159}
160EXPORT_SYMBOL(abort_exclusive_wait);
161
120int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 162int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
121{ 163{
122 int ret = default_wake_function(wait, mode, sync, key); 164 int ret = default_wake_function(wait, mode, sync, key);
@@ -177,17 +219,20 @@ int __sched
177__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 219__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
178 int (*action)(void *), unsigned mode) 220 int (*action)(void *), unsigned mode)
179{ 221{
180 int ret = 0;
181
182 do { 222 do {
223 int ret;
224
183 prepare_to_wait_exclusive(wq, &q->wait, mode); 225 prepare_to_wait_exclusive(wq, &q->wait, mode);
184 if (test_bit(q->key.bit_nr, q->key.flags)) { 226 if (!test_bit(q->key.bit_nr, q->key.flags))
185 if ((ret = (*action)(q->key.flags))) 227 continue;
186 break; 228 ret = action(q->key.flags);
187 } 229 if (!ret)
230 continue;
231 abort_exclusive_wait(wq, &q->wait, mode, &q->key);
232 return ret;
188 } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); 233 } while (test_and_set_bit(q->key.bit_nr, q->key.flags));
189 finish_wait(wq, &q->wait); 234 finish_wait(wq, &q->wait);
190 return ret; 235 return 0;
191} 236}
192EXPORT_SYMBOL(__wait_on_bit_lock); 237EXPORT_SYMBOL(__wait_on_bit_lock);
193 238