author	Michal Hocko <mhocko@suse.com>	2016-07-28 18:48:44 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-28 19:07:41 -0400
commit	4e390b2b2f34b8daaabf2df1df0cf8f798b87ddb (patch)
tree	cdce74e799044ad21ab44fde1af0af9f8f898ffc	/mm/mempool.c
parent	1d2047fefa20e49072f6a37a7f71544e8cace529 (diff)
Revert "mm, mempool: only set __GFP_NOMEMALLOC if there are free elements"
This reverts commit f9054c70d28b ("mm, mempool: only set __GFP_NOMEMALLOC if there are free elements").

There has been a report about the OOM killer being invoked when swapping out to a dm-crypt device. The primary reason seems to be that the swapout IO managed to completely deplete memory reserves. Ondrej was able to bisect the problem and explained the issue by pointing to f9054c70d28b ("mm, mempool: only set __GFP_NOMEMALLOC if there are free elements").

The reason is that the swapout path is not throttled properly, because the md-raid layer needs to allocate from the generic_make_request path, which means it allocates from the PF_MEMALLOC context. The dm layer uses mempool_alloc in order to guarantee forward progress, which used to inhibit access to memory reserves when using the page allocator. This changed with f9054c70d28b ("mm, mempool: only set __GFP_NOMEMALLOC if there are free elements"), which dropped the __GFP_NOMEMALLOC protection when the memory pool is depleted. If we are running out of memory and the only way to free memory is to perform swapout, we just keep consuming memory reserves rather than throttling the mempool allocations and allowing the pending IO to complete, up to the moment when memory is depleted completely and there is no way forward but to invoke the OOM killer. This is less than optimal.

The original intention of f9054c70d28b was to help with OOM situations where the OOM victim depends on a mempool allocation to make forward progress. David has mentioned the following backtrace:

  schedule
  schedule_timeout
  io_schedule_timeout
  mempool_alloc
  __split_and_process_bio
  dm_request
  generic_make_request
  submit_bio
  mpage_readpages
  ext4_readpages
  __do_page_cache_readahead
  ra_submit
  filemap_fault
  handle_mm_fault
  __do_page_fault
  do_page_fault
  page_fault

We do not know more about why the mempool is depleted without being replenished in time, though. In any case the dm layer shouldn't depend on any allocations outside of its dedicated pools, so forward progress should be guaranteed. If that is not the case, then dm should be fixed rather than papering over the problem and postponing it by accessing more memory reserves.

mempools are a mechanism to maintain dedicated memory reserves to guarantee forward progress. Allowing them unbounded access to the page allocator's memory reserves goes against the whole purpose of this mechanism.

Bisected by Ondrej Kozina.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20160721145309.GR26379@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Ondrej Kozina <okozina@redhat.com>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: NeilBrown <neilb@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Ondrej Kozina <okozina@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Cc: Mel Gorman <mgorman@suse.de>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
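For context, here is a minimal sketch of the mempool pattern the message refers to. It is not from this patch; struct io_ctx, io_cache, io_pool and the reserve size of 16 are hypothetical. The point is that a driver preallocates its own reserve of elements, so mempool_alloc() can always fall back to that reserve (or wait for mempool_free() to refill it) instead of dipping into the page allocator's emergency reserves:

	#include <linux/mempool.h>
	#include <linux/slab.h>

	struct io_ctx {			/* hypothetical per-IO bookkeeping */
		void *private;
	};

	static struct kmem_cache *io_cache;
	static mempool_t *io_pool;

	static int io_pool_init(void)
	{
		io_cache = kmem_cache_create("io_ctx", sizeof(struct io_ctx),
					     0, 0, NULL);
		if (!io_cache)
			return -ENOMEM;

		/* Preallocate 16 elements as the pool's dedicated reserve. */
		io_pool = mempool_create_slab_pool(16, io_cache);
		if (!io_pool) {
			kmem_cache_destroy(io_cache);
			return -ENOMEM;
		}
		return 0;
	}

	static void io_submit(void)
	{
		/*
		 * With __GFP_DIRECT_RECLAIM set (GFP_NOIO includes it) this
		 * cannot fail: if the slab allocation fails and the pool is
		 * empty, mempool_alloc() sleeps until an element comes back
		 * via mempool_free() -- the throttling this revert restores.
		 */
		struct io_ctx *ctx = mempool_alloc(io_pool, GFP_NOIO);

		/* ... issue the IO; on completion: */
		mempool_free(ctx, io_pool);
	}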
Diffstat (limited to 'mm/mempool.c')
-rw-r--r--	mm/mempool.c	18
1 file changed, 3 insertions(+), 15 deletions(-)
diff --git a/mm/mempool.c b/mm/mempool.c
index 8f65464da5de..47a659dedd44 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -306,7 +306,7 @@ EXPORT_SYMBOL(mempool_resize);
  * returns NULL. Note that due to preallocation, this function
  * *never* fails when called from process contexts. (it might
  * fail if called from an IRQ context.)
- * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported.
+ * Note: using __GFP_ZERO is not supported.
  */
 void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 {
@@ -315,27 +315,16 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 	wait_queue_t wait;
 	gfp_t gfp_temp;
 
-	/* If oom killed, memory reserves are essential to prevent livelock */
-	VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC);
-	/* No element size to zero on allocation */
 	VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
-
 	might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
+	gfp_mask |= __GFP_NOMEMALLOC;	/* don't allocate emergency reserves */
 	gfp_mask |= __GFP_NORETRY;	/* don't loop in __alloc_pages */
 	gfp_mask |= __GFP_NOWARN;	/* failures are OK */
 
 	gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
 
 repeat_alloc:
-	if (likely(pool->curr_nr)) {
-		/*
-		 * Don't allocate from emergency reserves if there are
-		 * elements available. This check is racy, but it will
-		 * be rechecked each loop.
-		 */
-		gfp_temp |= __GFP_NOMEMALLOC;
-	}
 
 	element = pool->alloc(gfp_temp, pool->pool_data);
 	if (likely(element != NULL))
@@ -359,12 +348,11 @@ repeat_alloc:
 	 * We use gfp mask w/o direct reclaim or IO for the first round.  If
 	 * alloc failed with that and @pool was empty, retry immediately.
 	 */
-	if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) {
+	if (gfp_temp != gfp_mask) {
 		spin_unlock_irqrestore(&pool->lock, flags);
 		gfp_temp = gfp_mask;
 		goto repeat_alloc;
 	}
-	gfp_temp = gfp_mask;
 
 	/* We must not sleep if !__GFP_DIRECT_RECLAIM */
 	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
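To make the restored behavior easier to follow, here is a condensed, hand-simplified model of the allocation flow mempool_alloc() has after this revert. It is a reading aid, not the actual function: locking, the wait-queue handling and the step that takes a preallocated element from pool->elements are elided or reduced to comments:

	void *mempool_alloc_model(mempool_t *pool, gfp_t gfp_mask)
	{
		void *element;
		gfp_t gfp_temp;

		/* Restored by this revert: never touch emergency reserves. */
		gfp_mask |= __GFP_NOMEMALLOC;
		gfp_mask |= __GFP_NORETRY;	/* don't loop in __alloc_pages */
		gfp_mask |= __GFP_NOWARN;	/* failures are OK */

		/* Round one: no direct reclaim, no IO, so we fail fast. */
		gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);

	repeat_alloc:
		element = pool->alloc(gfp_temp, pool->pool_data);
		if (element)
			return element;

		/* (elided: if pool->curr_nr > 0, take and return a
		 * preallocated element from the pool's own reserve) */

		/* Round two: retry immediately with the caller's full mask. */
		if (gfp_temp != gfp_mask) {
			gfp_temp = gfp_mask;
			goto repeat_alloc;
		}

		if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
			return NULL;	/* not allowed to sleep: give up */

		/* (elided: sleep until mempool_free() refills the pool --
		 * this wait is the throttling that keeps swapout IO from
		 * exhausting the system) */
		goto repeat_alloc;
	}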