aboutsummaryrefslogtreecommitdiffstats
path: root/mm/mempool.c
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2012-01-10 18:08:23 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-01-10 19:30:45 -0500
commit5b990546e33477c34ee6fbc20fad6584386b46c3 (patch)
tree2e4aa8b194aefee6286a2d0040e15d31df5f66c1 /mm/mempool.c
parent564c81db19f3630f53a14bbceb7b85eb9660ded3 (diff)
mempool: fix and document synchronization and memory barrier usage
mempool_alloc/free() use undocumented smp_mb()'s. The code is slightly broken and misleading. The lockless part is in mempool_free(). It wants to determine whether the item being freed needs to be returned to the pool or backing allocator without grabbing pool->lock. Two things need to be guaranteed for correct operation. 1. pool->curr_nr + #allocated should never dip below pool->min_nr. 2. Waiters shouldn't be left dangling. For #1, The only necessary condition is that curr_nr visible at free is from after the allocation of the element being freed (details in the comment). For most cases, this is true without any barrier but there can be fringe cases where the allocated pointer is passed to the freeing task without going through memory barriers. To cover this case, wmb is necessary before returning from allocation and rmb is necessary before reading curr_nr. IOW, ALLOCATING TASK FREEING TASK update pool state after alloc; wmb(); pass pointer to freeing task; read pointer; rmb(); read pool state to free; The current code doesn't have wmb after pool update during allocation and may theoretically, on machines where unlock doesn't behave as full wmb, lead to pool depletion and deadlock. smp_wmb() needs to be added after successful allocation from reserved elements and smp_mb() in mempool_free() can be replaced with smp_rmb(). For #2, the waiter needs to add itself to waitqueue and then check the wait condition and the waker needs to update the wait condition and then wake up. Because waitqueue operations always go through full spinlock synchronization, there is no need for extra memory barriers. Furthermore, mempool_alloc() is already holding pool->lock when it decides that it needs to wait. There is no reason to do unlock - add waitqueue - test condition again. It can simply add itself to waitqueue while holding pool->lock and then unlock and sleep. This patch adds smp_wmb() after successful allocation from reserved pool, replaces smp_mb() in mempool_free() with smp_rmb() and extend pool->lock over waitqueue addition. More importantly, it explains what memory barriers do and how the lockless testing is correct. -v2: Oleg pointed out that unlock doesn't imply wmb. Added explicit smp_wmb() after successful allocation from reserved pool and updated comments accordingly. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Cc: David Howells <dhowells@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mempool.c')
-rw-r--r--mm/mempool.c61
1 files changed, 48 insertions, 13 deletions
diff --git a/mm/mempool.c b/mm/mempool.c
index e73641b79bb5..11f0d0a5e0f8 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -224,28 +224,31 @@ repeat_alloc:
224 if (likely(pool->curr_nr)) { 224 if (likely(pool->curr_nr)) {
225 element = remove_element(pool); 225 element = remove_element(pool);
226 spin_unlock_irqrestore(&pool->lock, flags); 226 spin_unlock_irqrestore(&pool->lock, flags);
227 /* paired with rmb in mempool_free(), read comment there */
228 smp_wmb();
227 return element; 229 return element;
228 } 230 }
229 spin_unlock_irqrestore(&pool->lock, flags);
230 231
231 /* We must not sleep in the GFP_ATOMIC case */ 232 /* We must not sleep in the GFP_ATOMIC case */
232 if (!(gfp_mask & __GFP_WAIT)) 233 if (!(gfp_mask & __GFP_WAIT)) {
234 spin_unlock_irqrestore(&pool->lock, flags);
233 return NULL; 235 return NULL;
236 }
234 237
235 /* Now start performing page reclaim */ 238 /* Let's wait for someone else to return an element to @pool */
236 gfp_temp = gfp_mask; 239 gfp_temp = gfp_mask;
237 init_wait(&wait); 240 init_wait(&wait);
238 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 241 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
239 smp_mb();
240 if (!pool->curr_nr) {
241 /*
242 * FIXME: this should be io_schedule(). The timeout is there
243 * as a workaround for some DM problems in 2.6.18.
244 */
245 io_schedule_timeout(5*HZ);
246 }
247 finish_wait(&pool->wait, &wait);
248 242
243 spin_unlock_irqrestore(&pool->lock, flags);
244
245 /*
246 * FIXME: this should be io_schedule(). The timeout is there as a
247 * workaround for some DM problems in 2.6.18.
248 */
249 io_schedule_timeout(5*HZ);
250
251 finish_wait(&pool->wait, &wait);
249 goto repeat_alloc; 252 goto repeat_alloc;
250} 253}
251EXPORT_SYMBOL(mempool_alloc); 254EXPORT_SYMBOL(mempool_alloc);
@@ -265,7 +268,39 @@ void mempool_free(void *element, mempool_t *pool)
265 if (unlikely(element == NULL)) 268 if (unlikely(element == NULL))
266 return; 269 return;
267 270
268 smp_mb(); 271 /*
272 * Paired with the wmb in mempool_alloc(). The preceding read is
273 * for @element and the following @pool->curr_nr. This ensures
274 * that the visible value of @pool->curr_nr is from after the
275 * allocation of @element. This is necessary for fringe cases
276 * where @element was passed to this task without going through
277 * barriers.
278 *
279 * For example, assume @p is %NULL at the beginning and one task
280 * performs "p = mempool_alloc(...);" while another task is doing
281 * "while (!p) cpu_relax(); mempool_free(p, ...);". This function
282 * may end up using curr_nr value which is from before allocation
283 * of @p without the following rmb.
284 */
285 smp_rmb();
286
287 /*
288 * For correctness, we need a test which is guaranteed to trigger
289 * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr
290 * without locking achieves that and refilling as soon as possible
291 * is desirable.
292 *
293 * Because curr_nr visible here is always a value after the
294 * allocation of @element, any task which decremented curr_nr below
295 * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
296 * incremented to min_nr afterwards. If curr_nr gets incremented
297 * to min_nr after the allocation of @element, the elements
298 * allocated after that are subject to the same guarantee.
299 *
300 * Waiters happen iff curr_nr is 0 and the above guarantee also
301 * ensures that there will be frees which return elements to the
302 * pool waking up the waiters.
303 */
269 if (pool->curr_nr < pool->min_nr) { 304 if (pool->curr_nr < pool->min_nr) {
270 spin_lock_irqsave(&pool->lock, flags); 305 spin_lock_irqsave(&pool->lock, flags);
271 if (pool->curr_nr < pool->min_nr) { 306 if (pool->curr_nr < pool->min_nr) {