author     Michal Hocko <mhocko@suse.com>                  2016-05-20 19:57:00 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-05-20 20:58:30 -0400
commit     0a0337e0d1d134465778a16f5cbea95086e8e9e0
tree       9ad66db1af93fe7c6482b30ab5c3b4f3ce0e648f  /mm/page_alloc.c
parent     cab1802b5f0dddea30547a7451fda8c7e4c593f0
mm, oom: rework oom detection
__alloc_pages_slowpath has traditionally relied on direct reclaim and did_some_progress as an indicator that it makes sense to retry the allocation rather than declaring OOM. shrink_zones had to rely on zone_reclaimable if shrink_zone did not make any progress, to prevent a premature OOM killer invocation - the LRU might be full of dirty or writeback pages and direct reclaim cannot clean those up. zone_reclaimable allows rescanning the reclaimable lists several times and restarting if a page is freed. This is really subtle behavior and it might lead to a livelock when a single freed page keeps the allocator looping but the current task is not able to allocate that single page. The OOM killer would be more appropriate than looping without any progress for an unbounded amount of time.

This patch changes the OOM detection logic and pulls it out from shrink_zone, which is too low a level to be appropriate for a high-level decision such as OOM, which is a per-zonelist property. It is __alloc_pages_slowpath which knows how many attempts have been made and what the progress was so far, so it is the more appropriate place to implement this logic.

The new heuristic is implemented in the should_reclaim_retry helper called from __alloc_pages_slowpath. It tries to be more deterministic and easier to follow. It builds on the assumption that retrying makes sense only if the currently reclaimable memory plus free pages would allow the current allocation request to succeed (as per __zone_watermark_ok) for at least one zone in the usable zonelist.

This alone wouldn't be sufficient, though, because writeback might get stuck and reclaimable pages might be pinned for a really long time or even depend on the current allocation context. Therefore a backoff mechanism is implemented which reduces the reclaim target after each reclaim round without any progress. This means that we should eventually converge to only NR_FREE_PAGES as the target, fail the watermark check and proceed to OOM. The backoff is simple and linear: 1/16 of the reclaimable pages for each round without any progress. We are optimistic and reset the counter for successful reclaim rounds.

Costly high-order pages mostly preserve their semantics: those without __GFP_REPEAT fail right away, while those which have the flag set back off once the number of reclaimed pages reaches the equivalent of the requested order. The only difference is that if there was no progress during the reclaim we rely on the zone watermark check, which is more logical than the previous 1<<order attempts that were a result of zone_reclaimable faking the progress.

[vdavydov@virtuozzo.com: check classzone_idx for shrink_zone]
[hannes@cmpxchg.org: separate the heuristic into should_reclaim_retry]
[rientjes@google.com: use zone_page_state_snapshot for NR_FREE_PAGES]
[rientjes@google.com: shrink_zones doesn't need to return anything]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
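To make the backoff described above concrete, here is a minimal user-space sketch of the per-round reclaim target (not part of the patch): DIV_ROUND_UP and MAX_RECLAIM_RETRIES mirror the kernel definitions, while the zone figures and the main() harness are made-up values purely for illustration.

    #include <stdio.h>

    /* Mirrors the kernel macro; the rest of this harness is illustrative only. */
    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
    #define MAX_RECLAIM_RETRIES     16

    /*
     * Per-round reclaim target considered by the retry check: the reclaimable
     * pages, reduced by 1/16 for every reclaim round that made no progress,
     * plus the currently free pages. After MAX_RECLAIM_RETRIES fruitless
     * rounds only the free pages remain as the target.
     */
    static unsigned long backoff_target(unsigned long reclaimable,
                                        unsigned long free,
                                        int no_progress_loops)
    {
            unsigned long available = reclaimable;

            available -= DIV_ROUND_UP(no_progress_loops * available,
                                      MAX_RECLAIM_RETRIES);
            return available + free;
    }

    int main(void)
    {
            /* hypothetical zone: 40000 reclaimable pages, 2000 free pages */
            for (int loops = 0; loops <= MAX_RECLAIM_RETRIES; loops++)
                    printf("no_progress_loops=%2d -> target=%lu pages\n",
                           loops, backoff_target(40000, 2000, loops));
            return 0;
    }

With these hypothetical numbers the target decays linearly from 42000 pages at round 0 down to the 2000 free pages at round 16, which is roughly the point where reclaim can no longer be expected to help and should_reclaim_retry lets the allocator proceed to the OOM killer.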
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  100
1 file changed, 92 insertions(+), 8 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8bcc10616fab..fa39efc3a692 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3386,6 +3386,77 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
 	return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
 }
 
+/*
+ * Maximum number of reclaim retries without any progress before OOM killer
+ * is consider as the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make a forward progress
+ * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during
+ * the last reclaim round), pages_reclaimed (cumulative number of reclaimed
+ * pages) and no_progress_loops (number of reclaim rounds without any progress
+ * in a row) is considered as well as the reclaimable pages on the applicable
+ * zone list (with a backoff mechanism which is a function of no_progress_loops).
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+		     struct alloc_context *ac, int alloc_flags,
+		     bool did_some_progress, unsigned long pages_reclaimed,
+		     int no_progress_loops)
+{
+	struct zone *zone;
+	struct zoneref *z;
+
+	/*
+	 * Make sure we converge to OOM if we cannot make any progress
+	 * several times in the row.
+	 */
+	if (no_progress_loops > MAX_RECLAIM_RETRIES)
+		return false;
+
+	if (order > PAGE_ALLOC_COSTLY_ORDER) {
+		if (pages_reclaimed >= (1<<order))
+			return false;
+
+		if (did_some_progress)
+			return true;
+	}
+
+	/*
+	 * Keep reclaiming pages while there is a chance this will lead somewhere.
+	 * If none of the target zones can satisfy our allocation request even
+	 * if all reclaimable pages are considered then we are screwed and have
+	 * to go OOM.
+	 */
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+					ac->nodemask) {
+		unsigned long available;
+
+		available = zone_reclaimable_pages(zone);
+		available -= DIV_ROUND_UP(no_progress_loops * available,
+					  MAX_RECLAIM_RETRIES);
+		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+		/*
+		 * Would the allocation succeed if we reclaimed the whole
+		 * available?
+		 */
+		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+				ac->high_zoneidx, alloc_flags, available)) {
+			/* Wait for some write requests to complete then retry */
+			wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 			struct alloc_context *ac)
@@ -3397,6 +3468,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	enum compact_result compact_result;
+	int no_progress_loops = 0;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3525,23 +3597,35 @@ retry:
 	if (gfp_mask & __GFP_NORETRY)
 		goto noretry;
 
-	/* Keep reclaiming pages as long as there is reasonable progress */
-	pages_reclaimed += did_some_progress;
-	if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
-	    ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
-		/* Wait for some write requests to complete then retry */
-		wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
-		goto retry;
+	/*
+	 * Do not retry costly high order allocations unless they are
+	 * __GFP_REPEAT
+	 */
+	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+		goto noretry;
+
+	if (did_some_progress) {
+		no_progress_loops = 0;
+		pages_reclaimed += did_some_progress;
+	} else {
+		no_progress_loops++;
 	}
 
+	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+				 did_some_progress > 0, pages_reclaimed,
+				 no_progress_loops))
+		goto retry;
+
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
 		goto got_pg;
 
 	/* Retry as long as the OOM killer is making progress */
-	if (did_some_progress)
+	if (did_some_progress) {
+		no_progress_loops = 0;
 		goto retry;
+	}
 
 noretry:
 	/*