author     Vlastimil Babka <vbabka@suse.cz>   2017-01-24 18:18:41 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-01-24 19:26:14 -0500
commit     e47483bca2cc59a4593b37a270b16ee42b1d9f08 (patch)
tree       4bc66bd4f2f87a30d231068468d29900685fb45f
parent     5ce9bfef1d27944c119a397a9d827bef795487ce (diff)
mm, page_alloc: fix premature OOM when racing with cpuset mems update
Ganapatrao Kulkarni reported that the LTP test cpuset01 in stress mode triggers the OOM killer within a few seconds, despite lots of free memory. The test repeatedly faults in memory in one process in a cpuset, while another process changes the cpuset's allowed nodes between 0 and 1.

The problem comes from insufficient protection against cpuset changes, which can cause get_page_from_freelist() to consider all zones non-eligible due to the nodemask and/or current->mems_allowed. This was masked in the past by a sufficient number of retries, but since commit 682a3385e773 ("mm, page_alloc: inline the fast path of the zonelist iterator") we set preferred_zoneref once and no longer iterate over the whole zonelist in further attempts, so the only eligible zones may lie in the zonelist before our starting point and we miss them every time.

A previous patch fixed this problem for current->mems_allowed. However, cpuset changes also update the task's mempolicy nodemask. The fix has two parts: we have to repeat the preferred_zoneref search when we detect a cpuset update by way of the seqcount, and we have to check the seqcount before considering OOM.

[akpm@linux-foundation.org: fix typo in comment]
Link: http://lkml.kernel.org/r/20170120103843.24587-5-vbabka@suse.cz
Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice")
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Ganapatrao Kulkarni <gpkulkarni@gmail.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  mm/page_alloc.c | 35 ++++++++++++++++++++++++-----------
1 file changed, 24 insertions(+), 11 deletions(-)
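For readers unfamiliar with the cpuset_mems_cookie mechanism the patch relies on: read_mems_allowed_begin() samples a sequence counter and read_mems_allowed_retry() reports whether a mems_allowed/mempolicy update completed in between, so a failing allocation can loop back and retry instead of declaring OOM. Below is a minimal userspace sketch of that read/retry pattern, not the kernel implementation; all names here (mems_seq, the sketch_* helpers, sketch_alloc_attempt) are invented for illustration and the memory-ordering details are simplified.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical sequence counter guarding "allowed nodes" updates. */
static atomic_uint mems_seq;

/* Reader side: sample the counter, waiting out any in-progress writer. */
static unsigned int sketch_read_mems_allowed_begin(void)
{
	unsigned int seq;

	do {
		seq = atomic_load_explicit(&mems_seq, memory_order_acquire);
	} while (seq & 1);	/* odd means a writer is mid-update */

	return seq;
}

/* Reader side: true if an update happened since the cookie was taken. */
static bool sketch_read_mems_allowed_retry(unsigned int cookie)
{
	return atomic_load_explicit(&mems_seq, memory_order_acquire) != cookie;
}

/* Writer side: bump to odd before the update, back to even after it. */
static void sketch_update_mems_allowed(void)
{
	atomic_fetch_add_explicit(&mems_seq, 1, memory_order_release);
	/* ... rewrite the allowed-nodes mask here ... */
	atomic_fetch_add_explicit(&mems_seq, 1, memory_order_release);
}

/* Hypothetical allocation attempt that can fail spuriously during an update. */
static bool sketch_alloc_attempt(void)
{
	return false;	/* pretend every zone looked non-eligible */
}

int main(void)
{
	unsigned int cookie;
	bool ok;

retry_cpuset:
	cookie = sketch_read_mems_allowed_begin();
	ok = sketch_alloc_attempt();

	/* As in the patch: before giving up, check whether we raced and retry. */
	if (!ok && sketch_read_mems_allowed_retry(cookie))
		goto retry_cpuset;

	(void)sketch_update_mems_allowed;	/* writer side shown for completeness */
	printf("allocation %s\n", ok ? "succeeded" : "failed");
	return 0;
}

The point of the pattern is that the reader never blocks the writer; it merely detects that it raced and jumps back to retry_cpuset, which is exactly the check the hunks below add before the OOM decision.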
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0df3c089d3af..f3e0c69a97b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3555,6 +3555,17 @@ retry_cpuset:
 	no_progress_loops = 0;
 	compact_priority = DEF_COMPACT_PRIORITY;
 	cpuset_mems_cookie = read_mems_allowed_begin();
+	/*
+	 * We need to recalculate the starting point for the zonelist iterator
+	 * because we might have used different nodemask in the fast path, or
+	 * there was a cpuset modification and we are retrying - otherwise we
+	 * could end up iterating over non-eligible zones endlessly.
+	 */
+	ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
+					ac->high_zoneidx, ac->nodemask);
+	if (!ac->preferred_zoneref->zone)
+		goto nopage;
+
 
 	/*
 	 * The fast path uses conservative alloc_flags to succeed only until
@@ -3715,6 +3726,13 @@ retry:
 					&compaction_retries))
 		goto retry;
 
+	/*
+	 * It's possible we raced with cpuset update so the OOM would be
+	 * premature (see below the nopage: label for full explanation).
+	 */
+	if (read_mems_allowed_retry(cpuset_mems_cookie))
+		goto retry_cpuset;
+
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
@@ -3728,10 +3746,11 @@ retry:
 
 nopage:
 	/*
-	 * When updating a task's mems_allowed, it is possible to race with
-	 * parallel threads in such a way that an allocation can fail while
-	 * the mask is being updated. If a page allocation is about to fail,
-	 * check if the cpuset changed during allocation and if so, retry.
+	 * When updating a task's mems_allowed or mempolicy nodemask, it is
+	 * possible to race with parallel threads in such a way that our
+	 * allocation can fail while the mask is being updated. If we are about
+	 * to fail, check if the cpuset changed during allocation and if so,
+	 * retry.
 	 */
 	if (read_mems_allowed_retry(cpuset_mems_cookie))
 		goto retry_cpuset;
@@ -3822,15 +3841,9 @@ no_zone:
 	/*
 	 * Restore the original nodemask if it was potentially replaced with
 	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
-	 * Also recalculate the starting point for the zonelist iterator or
-	 * we could end up iterating over non-eligible zones endlessly.
 	 */
-	if (unlikely(ac.nodemask != nodemask)) {
+	if (unlikely(ac.nodemask != nodemask))
 		ac.nodemask = nodemask;
-		ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
-						ac.high_zoneidx, ac.nodemask);
-		/* If we have NULL preferred zone, slowpath wll handle that */
-	}
 
 	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
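To make the failure mode concrete: after commit 682a3385e773 the slow path keeps a cached preferred_zoneref instead of rescanning the zonelist from the top, so if a racing cpuset or mempolicy update leaves the only eligible zone before that cached starting point, every retry misses it and OOM looks justified. The following standalone sketch, with invented names (eligible, find_zone_from, NR_ZONES) standing in for the real zonelist iterator, illustrates why the starting point has to be recomputed when such a change is detected.

#include <stdbool.h>
#include <stdio.h>

#define NR_ZONES 4

/* Hypothetical per-zone eligibility, standing in for the nodemask check. */
static bool eligible[NR_ZONES];

/* Scan onward from a cached starting index, like the cached preferred_zoneref. */
static int find_zone_from(int start_idx)
{
	for (int i = start_idx; i < NR_ZONES; i++) {
		if (eligible[i])
			return i;
	}
	return -1;	/* "no eligible zone" -> looks like grounds for OOM */
}

int main(void)
{
	/* Fast path: only zone 2 is allowed, so the cached start point becomes 2. */
	eligible[2] = true;
	int start = find_zone_from(0);		/* start == 2 */

	/* Racing cpuset/mempolicy update: allowed nodes switch from {2} to {0}. */
	eligible[2] = false;
	eligible[0] = true;

	/* Retrying from the stale start point misses zone 0 every time. */
	printf("stale start point: %d\n", find_zone_from(start));	/* prints -1 */

	/* The fix: recompute the starting point once the change is detected. */
	printf("recomputed start:  %d\n", find_zone_from(0));		/* prints 0 */

	return 0;
}

Recomputing the starting point on retry, as the first hunk above does with first_zones_zonelist(), is what turns the spurious "no eligible zone" result back into a hit.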