Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  526
1 file changed, 234 insertions(+), 292 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c503a116..a47f0b229a1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
28#include <linux/kasan.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include <linux/suspend.h> 30#include <linux/suspend.h>
30#include <linux/pagevec.h> 31#include <linux/pagevec.h>
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
172 * 1G machine -> (16M dma, 784M normal, 224M high) 173 * 1G machine -> (16M dma, 784M normal, 224M high)
173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 174 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 175 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
175 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 176 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
176 * 177 *
177 * TBD: should special case ZONE_DMA32 machines here - in those we normally 178 * TBD: should special case ZONE_DMA32 machines here - in those we normally
178 * don't need any ZONE_NORMAL reservation 179 * don't need any ZONE_NORMAL reservation
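
The reservation figures in the comment above follow directly from the lowmem reserve ratios. Below is a minimal userspace sketch of that arithmetic, assuming the ratios of 256 for ZONE_DMA and 32 for ZONE_NORMAL that the example figures imply; it is an illustration only, not kernel code.

#include <stdio.h>

int main(void)
{
	/* the 1G example from the comment, all figures in megabytes */
	unsigned long dma = 16, normal = 784, high = 224;
	unsigned long dma_ratio = 256, normal_ratio = 32;	/* assumed defaults */

	printf("zones: %luM dma, %luM normal, %luM high\n", dma, normal, high);
	printf("NORMAL alloc reserves %luM of ZONE_DMA\n", normal / dma_ratio);
	printf("HIGHMEM alloc reserves %luM of ZONE_NORMAL\n", high / normal_ratio);
	printf("HIGHMEM alloc reserves %luM of ZONE_DMA\n",
	       (high + normal) / dma_ratio);
	return 0;
}
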
@@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
244 PB_migrate, PB_migrate_end); 245 PB_migrate, PB_migrate_end);
245} 246}
246 247
247bool oom_killer_disabled __read_mostly;
248
249#ifdef CONFIG_DEBUG_VM 248#ifdef CONFIG_DEBUG_VM
250static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
251{ 250{
@@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order)
381 } 380 }
382} 381}
383 382
384/* update __split_huge_page_refcount if you change this function */
385static int destroy_compound_page(struct page *page, unsigned long order)
386{
387 int i;
388 int nr_pages = 1 << order;
389 int bad = 0;
390
391 if (unlikely(compound_order(page) != order)) {
392 bad_page(page, "wrong compound order", 0);
393 bad++;
394 }
395
396 __ClearPageHead(page);
397
398 for (i = 1; i < nr_pages; i++) {
399 struct page *p = page + i;
400
401 if (unlikely(!PageTail(p))) {
402 bad_page(page, "PageTail not set", 0);
403 bad++;
404 } else if (unlikely(p->first_page != page)) {
405 bad_page(page, "first_page not consistent", 0);
406 bad++;
407 }
408 __ClearPageTail(p);
409 }
410
411 return bad;
412}
413
414static inline void prep_zero_page(struct page *page, unsigned int order, 383static inline void prep_zero_page(struct page *page, unsigned int order,
415 gfp_t gfp_flags) 384 gfp_t gfp_flags)
416{ 385{
@@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
552 return 0; 521 return 0;
553 522
554 if (page_is_guard(buddy) && page_order(buddy) == order) { 523 if (page_is_guard(buddy) && page_order(buddy) == order) {
555 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
556
557 if (page_zone_id(page) != page_zone_id(buddy)) 524 if (page_zone_id(page) != page_zone_id(buddy))
558 return 0; 525 return 0;
559 526
527 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
528
560 return 1; 529 return 1;
561 } 530 }
562 531
563 if (PageBuddy(buddy) && page_order(buddy) == order) { 532 if (PageBuddy(buddy) && page_order(buddy) == order) {
564 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
565
566 /* 533 /*
567 * zone check is done late to avoid uselessly 534 * zone check is done late to avoid uselessly
568 * calculating zone/node ids for pages that could 535 * calculating zone/node ids for pages that could
@@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
571 if (page_zone_id(page) != page_zone_id(buddy)) 538 if (page_zone_id(page) != page_zone_id(buddy))
572 return 0; 539 return 0;
573 540
541 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
542
574 return 1; 543 return 1;
575 } 544 }
576 return 0; 545 return 0;
@@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page,
613 int max_order = MAX_ORDER; 582 int max_order = MAX_ORDER;
614 583
615 VM_BUG_ON(!zone_is_initialized(zone)); 584 VM_BUG_ON(!zone_is_initialized(zone));
616 585 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
617 if (unlikely(PageCompound(page)))
618 if (unlikely(destroy_compound_page(page, order)))
619 return;
620 586
621 VM_BUG_ON(migratetype == -1); 587 VM_BUG_ON(migratetype == -1);
622 if (is_migrate_isolate(migratetype)) { 588 if (is_migrate_isolate(migratetype)) {
@@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone,
797 spin_unlock(&zone->lock); 763 spin_unlock(&zone->lock);
798} 764}
799 765
766static int free_tail_pages_check(struct page *head_page, struct page *page)
767{
768 if (!IS_ENABLED(CONFIG_DEBUG_VM))
769 return 0;
770 if (unlikely(!PageTail(page))) {
771 bad_page(page, "PageTail not set", 0);
772 return 1;
773 }
774 if (unlikely(page->first_page != head_page)) {
775 bad_page(page, "first_page not consistent", 0);
776 return 1;
777 }
778 return 0;
779}
780
800static bool free_pages_prepare(struct page *page, unsigned int order) 781static bool free_pages_prepare(struct page *page, unsigned int order)
801{ 782{
802 int i; 783 bool compound = PageCompound(page);
803 int bad = 0; 784 int i, bad = 0;
804 785
805 VM_BUG_ON_PAGE(PageTail(page), page); 786 VM_BUG_ON_PAGE(PageTail(page), page);
806 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); 787 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
807 788
808 trace_mm_page_free(page, order); 789 trace_mm_page_free(page, order);
809 kmemcheck_free_shadow(page, order); 790 kmemcheck_free_shadow(page, order);
791 kasan_free_pages(page, order);
810 792
811 if (PageAnon(page)) 793 if (PageAnon(page))
812 page->mapping = NULL; 794 page->mapping = NULL;
813 for (i = 0; i < (1 << order); i++) 795 bad += free_pages_check(page);
796 for (i = 1; i < (1 << order); i++) {
797 if (compound)
798 bad += free_tail_pages_check(page, page + i);
814 bad += free_pages_check(page + i); 799 bad += free_pages_check(page + i);
800 }
815 if (bad) 801 if (bad)
816 return false; 802 return false;
817 803
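
The new free_tail_pages_check() takes over the tail-page validation that the removed destroy_compound_page() used to do at free time: every tail page must have PageTail set and first_page pointing back at the head. A toy userspace model of those two checks, using a made-up struct in place of struct page:

#include <stdbool.h>
#include <stdio.h>

/* deliberately tiny stand-in for struct page, enough for the two checks */
struct fake_page {
	bool tail;			/* models PageTail() */
	struct fake_page *first_page;	/* models page->first_page */
};

static int check_tail(struct fake_page *head, struct fake_page *p)
{
	if (!p->tail) {
		fprintf(stderr, "PageTail not set\n");
		return 1;
	}
	if (p->first_page != head) {
		fprintf(stderr, "first_page not consistent\n");
		return 1;
	}
	return 0;
}

int main(void)
{
	struct fake_page pages[4] = { 0 };
	int i, bad = 0;

	for (i = 1; i < 4; i++) {
		pages[i].tail = true;
		pages[i].first_page = &pages[0];
	}
	pages[3].first_page = NULL;	/* inject one inconsistency */

	for (i = 1; i < 4; i++)
		bad += check_tail(&pages[0], &pages[i]);

	printf("bad tail pages: %d\n", bad);
	return 0;
}
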
@@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page)
970 return 0; 956 return 0;
971} 957}
972 958
973static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 959static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
960 int alloc_flags)
974{ 961{
975 int i; 962 int i;
976 963
@@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
985 972
986 arch_alloc_page(page, order); 973 arch_alloc_page(page, order);
987 kernel_map_pages(page, 1 << order, 1); 974 kernel_map_pages(page, 1 << order, 1);
975 kasan_alloc_pages(page, order);
988 976
989 if (gfp_flags & __GFP_ZERO) 977 if (gfp_flags & __GFP_ZERO)
990 prep_zero_page(page, order, gfp_flags); 978 prep_zero_page(page, order, gfp_flags);
@@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
994 982
995 set_page_owner(page, order, gfp_flags); 983 set_page_owner(page, order, gfp_flags);
996 984
985 /*
986 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
987 * allocate the page. The expectation is that the caller is taking
988 * steps that will free more memory. The caller should avoid the page
989 * being used for !PFMEMALLOC purposes.
990 */
991 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
992
997 return 0; 993 return 0;
998} 994}
999 995
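
The !!(...) in the added pfmemalloc assignment is the usual idiom for collapsing a bitmask test to exactly 0 or 1 before storing it in a boolean-like field. A tiny standalone demonstration with invented flag values (the real ALLOC_* flags live in the mm internals and are not reproduced here):

#include <stdio.h>

#define TOY_ALLOC_NO_WATERMARKS 0x04	/* invented value for illustration */
#define TOY_ALLOC_CPUSET	0x01	/* invented value for illustration */

int main(void)
{
	int alloc_flags = TOY_ALLOC_NO_WATERMARKS | TOY_ALLOC_CPUSET;

	/* plain mask test yields the bit value, not 0/1 */
	printf("mask test : %d\n", alloc_flags & TOY_ALLOC_NO_WATERMARKS);
	/* double negation normalises it to exactly 0 or 1 */
	printf("with !!   : %d\n", !!(alloc_flags & TOY_ALLOC_NO_WATERMARKS));
	return 0;
}
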
@@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page,
1130} 1126}
1131 1127
1132/* 1128/*
1133 * If breaking a large block of pages, move all free pages to the preferred 1129 * When we are falling back to another migratetype during allocation, try to
1134 * allocation list. If falling back for a reclaimable kernel allocation, be 1130 * steal extra free pages from the same pageblocks to satisfy further
1135 * more aggressive about taking ownership of free pages. 1131 * allocations, instead of polluting multiple pageblocks.
1136 * 1132 *
1137 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1133 * If we are stealing a relatively large buddy page, it is likely there will
1138 * nor move CMA pages to different free lists. We don't want unmovable pages 1134 * be more free pages in the pageblock, so try to steal them all. For
1139 * to be allocated from MIGRATE_CMA areas. 1135 * reclaimable and unmovable allocations, we steal regardless of page size,
1136 * as fragmentation caused by those allocations polluting movable pageblocks
1137 * is worse than movable allocations stealing from unmovable and reclaimable
1138 * pageblocks.
1140 * 1139 *
1141 * Returns the new migratetype of the pageblock (or the same old migratetype 1140 * If we claim more than half of the pageblock, change pageblock's migratetype
1142 * if it was unchanged). 1141 * as well.
1143 */ 1142 */
1144static int try_to_steal_freepages(struct zone *zone, struct page *page, 1143static void try_to_steal_freepages(struct zone *zone, struct page *page,
1145 int start_type, int fallback_type) 1144 int start_type, int fallback_type)
1146{ 1145{
1147 int current_order = page_order(page); 1146 int current_order = page_order(page);
1148 1147
1149 /*
1150 * When borrowing from MIGRATE_CMA, we need to release the excess
1151 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1152 * is set to CMA so it is returned to the correct freelist in case
1153 * the page ends up being not actually allocated from the pcp lists.
1154 */
1155 if (is_migrate_cma(fallback_type))
1156 return fallback_type;
1157
1158 /* Take ownership for orders >= pageblock_order */ 1148 /* Take ownership for orders >= pageblock_order */
1159 if (current_order >= pageblock_order) { 1149 if (current_order >= pageblock_order) {
1160 change_pageblock_range(page, current_order, start_type); 1150 change_pageblock_range(page, current_order, start_type);
1161 return start_type; 1151 return;
1162 } 1152 }
1163 1153
1164 if (current_order >= pageblock_order / 2 || 1154 if (current_order >= pageblock_order / 2 ||
1165 start_type == MIGRATE_RECLAIMABLE || 1155 start_type == MIGRATE_RECLAIMABLE ||
1156 start_type == MIGRATE_UNMOVABLE ||
1166 page_group_by_mobility_disabled) { 1157 page_group_by_mobility_disabled) {
1167 int pages; 1158 int pages;
1168 1159
@@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1170 1161
1171 /* Claim the whole block if over half of it is free */ 1162 /* Claim the whole block if over half of it is free */
1172 if (pages >= (1 << (pageblock_order-1)) || 1163 if (pages >= (1 << (pageblock_order-1)) ||
1173 page_group_by_mobility_disabled) { 1164 page_group_by_mobility_disabled)
1174
1175 set_pageblock_migratetype(page, start_type); 1165 set_pageblock_migratetype(page, start_type);
1176 return start_type;
1177 }
1178
1179 } 1166 }
1180
1181 return fallback_type;
1182} 1167}
1183 1168
1184/* Remove an element from the buddy allocator from the fallback list */ 1169/* Remove an element from the buddy allocator from the fallback list */
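
The "claim more than half of the pageblock" rule in try_to_steal_freepages() boils down to comparing the number of free pages moved against 1 << (pageblock_order - 1). A minimal sketch of that threshold, assuming pageblock_order == 9 (a 2MB block of 4KB pages) purely for illustration:

#include <stdbool.h>
#include <stdio.h>

#define TOY_PAGEBLOCK_ORDER 9	/* assumed value; arch dependent in reality */

static bool claim_whole_block(int pages_moved)
{
	/* same comparison as "Claim the whole block if over half of it is free" */
	return pages_moved >= (1 << (TOY_PAGEBLOCK_ORDER - 1));
}

int main(void)
{
	printf("threshold: %d pages\n", 1 << (TOY_PAGEBLOCK_ORDER - 1));
	printf("200 pages moved -> claim pageblock? %d\n", claim_whole_block(200));
	printf("300 pages moved -> claim pageblock? %d\n", claim_whole_block(300));
	return 0;
}
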
@@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1188 struct free_area *area; 1173 struct free_area *area;
1189 unsigned int current_order; 1174 unsigned int current_order;
1190 struct page *page; 1175 struct page *page;
1191 int migratetype, new_type, i;
1192 1176
1193 /* Find the largest possible block of pages in the other list */ 1177 /* Find the largest possible block of pages in the other list */
1194 for (current_order = MAX_ORDER-1; 1178 for (current_order = MAX_ORDER-1;
1195 current_order >= order && current_order <= MAX_ORDER-1; 1179 current_order >= order && current_order <= MAX_ORDER-1;
1196 --current_order) { 1180 --current_order) {
1181 int i;
1197 for (i = 0;; i++) { 1182 for (i = 0;; i++) {
1198 migratetype = fallbacks[start_migratetype][i]; 1183 int migratetype = fallbacks[start_migratetype][i];
1184 int buddy_type = start_migratetype;
1199 1185
1200 /* MIGRATE_RESERVE handled later if necessary */ 1186 /* MIGRATE_RESERVE handled later if necessary */
1201 if (migratetype == MIGRATE_RESERVE) 1187 if (migratetype == MIGRATE_RESERVE)
@@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1209 struct page, lru); 1195 struct page, lru);
1210 area->nr_free--; 1196 area->nr_free--;
1211 1197
1212 new_type = try_to_steal_freepages(zone, page, 1198 if (!is_migrate_cma(migratetype)) {
1213 start_migratetype, 1199 try_to_steal_freepages(zone, page,
1214 migratetype); 1200 start_migratetype,
1201 migratetype);
1202 } else {
1203 /*
1204 * When borrowing from MIGRATE_CMA, we need to
1205 * release the excess buddy pages to CMA
1206 * itself, and we do not try to steal extra
1207 * free pages.
1208 */
1209 buddy_type = migratetype;
1210 }
1215 1211
1216 /* Remove the page from the freelists */ 1212 /* Remove the page from the freelists */
1217 list_del(&page->lru); 1213 list_del(&page->lru);
1218 rmv_page_order(page); 1214 rmv_page_order(page);
1219 1215
1220 expand(zone, page, order, current_order, area, 1216 expand(zone, page, order, current_order, area,
1221 new_type); 1217 buddy_type);
1222 /* The freepage_migratetype may differ from pageblock's 1218
1219 /*
1220 * The freepage_migratetype may differ from pageblock's
1223 * migratetype depending on the decisions in 1221 * migratetype depending on the decisions in
1224 * try_to_steal_freepages. This is OK as long as it does 1222 * try_to_steal_freepages(). This is OK as long as it
1225 * not differ for MIGRATE_CMA type. 1223 * does not differ for MIGRATE_CMA pageblocks. For CMA
1224 * we need to make sure unallocated pages flushed from
1225 * pcp lists are returned to the correct freelist.
1226 */ 1226 */
1227 set_freepage_migratetype(page, new_type); 1227 set_freepage_migratetype(page, buddy_type);
1228 1228
1229 trace_mm_page_alloc_extfrag(page, order, current_order, 1229 trace_mm_page_alloc_extfrag(page, order, current_order,
1230 start_migratetype, migratetype, new_type); 1230 start_migratetype, migratetype);
1231 1231
1232 return page; 1232 return page;
1233 } 1233 }
@@ -1642,9 +1642,7 @@ int split_free_page(struct page *page)
1642} 1642}
1643 1643
1644/* 1644/*
1645 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1645 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
1646 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1647 * or two.
1648 */ 1646 */
1649static inline 1647static inline
1650struct page *buffered_rmqueue(struct zone *preferred_zone, 1648struct page *buffered_rmqueue(struct zone *preferred_zone,
@@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1655 struct page *page; 1653 struct page *page;
1656 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1654 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1657 1655
1658again:
1659 if (likely(order == 0)) { 1656 if (likely(order == 0)) {
1660 struct per_cpu_pages *pcp; 1657 struct per_cpu_pages *pcp;
1661 struct list_head *list; 1658 struct list_head *list;
@@ -1711,8 +1708,6 @@ again:
1711 local_irq_restore(flags); 1708 local_irq_restore(flags);
1712 1709
1713 VM_BUG_ON_PAGE(bad_range(zone, page), page); 1710 VM_BUG_ON_PAGE(bad_range(zone, page), page);
1714 if (prep_new_page(page, order, gfp_flags))
1715 goto again;
1716 return page; 1711 return page;
1717 1712
1718failed: 1713failed:
@@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone)
2033 * a page. 2028 * a page.
2034 */ 2029 */
2035static struct page * 2030static struct page *
2036get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 2031get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2037 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 2032 const struct alloc_context *ac)
2038 struct zone *preferred_zone, int classzone_idx, int migratetype)
2039{ 2033{
2034 struct zonelist *zonelist = ac->zonelist;
2040 struct zoneref *z; 2035 struct zoneref *z;
2041 struct page *page = NULL; 2036 struct page *page = NULL;
2042 struct zone *zone; 2037 struct zone *zone;
@@ -2055,8 +2050,8 @@ zonelist_scan:
2055 * Scan zonelist, looking for a zone with enough free. 2050 * Scan zonelist, looking for a zone with enough free.
2056 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 2051 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
2057 */ 2052 */
2058 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2053 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2059 high_zoneidx, nodemask) { 2054 ac->nodemask) {
2060 unsigned long mark; 2055 unsigned long mark;
2061 2056
2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2057 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
@@ -2073,7 +2068,7 @@ zonelist_scan:
2073 * time the page has in memory before being reclaimed. 2068 * time the page has in memory before being reclaimed.
2074 */ 2069 */
2075 if (alloc_flags & ALLOC_FAIR) { 2070 if (alloc_flags & ALLOC_FAIR) {
2076 if (!zone_local(preferred_zone, zone)) 2071 if (!zone_local(ac->preferred_zone, zone))
2077 break; 2072 break;
2078 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { 2073 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
2079 nr_fair_skipped++; 2074 nr_fair_skipped++;
@@ -2111,7 +2106,7 @@ zonelist_scan:
2111 2106
2112 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 2107 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
2113 if (!zone_watermark_ok(zone, order, mark, 2108 if (!zone_watermark_ok(zone, order, mark,
2114 classzone_idx, alloc_flags)) { 2109 ac->classzone_idx, alloc_flags)) {
2115 int ret; 2110 int ret;
2116 2111
2117 /* Checked here to keep the fast path fast */ 2112 /* Checked here to keep the fast path fast */
@@ -2132,7 +2127,7 @@ zonelist_scan:
2132 } 2127 }
2133 2128
2134 if (zone_reclaim_mode == 0 || 2129 if (zone_reclaim_mode == 0 ||
2135 !zone_allows_reclaim(preferred_zone, zone)) 2130 !zone_allows_reclaim(ac->preferred_zone, zone))
2136 goto this_zone_full; 2131 goto this_zone_full;
2137 2132
2138 /* 2133 /*
@@ -2154,7 +2149,7 @@ zonelist_scan:
2154 default: 2149 default:
2155 /* did we reclaim enough */ 2150 /* did we reclaim enough */
2156 if (zone_watermark_ok(zone, order, mark, 2151 if (zone_watermark_ok(zone, order, mark,
2157 classzone_idx, alloc_flags)) 2152 ac->classzone_idx, alloc_flags))
2158 goto try_this_zone; 2153 goto try_this_zone;
2159 2154
2160 /* 2155 /*
@@ -2175,27 +2170,18 @@ zonelist_scan:
2175 } 2170 }
2176 2171
2177try_this_zone: 2172try_this_zone:
2178 page = buffered_rmqueue(preferred_zone, zone, order, 2173 page = buffered_rmqueue(ac->preferred_zone, zone, order,
2179 gfp_mask, migratetype); 2174 gfp_mask, ac->migratetype);
2180 if (page) 2175 if (page) {
2181 break; 2176 if (prep_new_page(page, order, gfp_mask, alloc_flags))
2177 goto try_this_zone;
2178 return page;
2179 }
2182this_zone_full: 2180this_zone_full:
2183 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2181 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2184 zlc_mark_zone_full(zonelist, z); 2182 zlc_mark_zone_full(zonelist, z);
2185 } 2183 }
2186 2184
2187 if (page) {
2188 /*
2189 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2190 * necessary to allocate the page. The expectation is
2191 * that the caller is taking steps that will free more
2192 * memory. The caller should avoid the page being used
2193 * for !PFMEMALLOC purposes.
2194 */
2195 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2196 return page;
2197 }
2198
2199 /* 2185 /*
2200 * The first pass makes sure allocations are spread fairly within the 2186 * The first pass makes sure allocations are spread fairly within the
2201 * local node. However, the local node might have free pages left 2187 * local node. However, the local node might have free pages left
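
With prep_new_page() now called from get_page_from_freelist(), a page that fails its checks is simply skipped and another one is pulled from the same zone via the goto try_this_zone. A toy allocator showing the shape of that retry; the names below are invented, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
	bool bad;
};

/* one corrupt entry first, to force the retry */
static struct toy_page pool[] = { { .bad = true }, { .bad = false } };
static unsigned int next;

static struct toy_page *toy_rmqueue(void)
{
	return next < 2 ? &pool[next++] : NULL;
}

static bool toy_prep_page(const struct toy_page *page)
{
	return !page->bad;	/* models check_new_page() flagging a bad page */
}

int main(void)
{
	const struct toy_page *page;

	while ((page = toy_rmqueue()) != NULL) {
		if (!toy_prep_page(page)) {
			puts("bad page detected, retrying from the same zone");
			continue;
		}
		puts("page prepared and returned to the caller");
		break;
	}
	return 0;
}
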
@@ -2208,7 +2194,7 @@ this_zone_full:
2208 alloc_flags &= ~ALLOC_FAIR; 2194 alloc_flags &= ~ALLOC_FAIR;
2209 if (nr_fair_skipped) { 2195 if (nr_fair_skipped) {
2210 zonelist_rescan = true; 2196 zonelist_rescan = true;
2211 reset_alloc_batches(preferred_zone); 2197 reset_alloc_batches(ac->preferred_zone);
2212 } 2198 }
2213 if (nr_online_nodes > 1) 2199 if (nr_online_nodes > 1)
2214 zonelist_rescan = true; 2200 zonelist_rescan = true;
@@ -2330,44 +2316,44 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2330 2316
2331static inline struct page * 2317static inline struct page *
2332__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2318__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2333 struct zonelist *zonelist, enum zone_type high_zoneidx, 2319 const struct alloc_context *ac, unsigned long *did_some_progress)
2334 nodemask_t *nodemask, struct zone *preferred_zone,
2335 int classzone_idx, int migratetype)
2336{ 2320{
2337 struct page *page; 2321 struct page *page;
2338 2322
2339 /* Acquire the per-zone oom lock for each zone */ 2323 *did_some_progress = 0;
2340 if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
2341 schedule_timeout_uninterruptible(1);
2342 return NULL;
2343 }
2344 2324
2345 /* 2325 /*
2346 * PM-freezer should be notified that there might be an OOM killer on 2326 * Acquire the per-zone oom lock for each zone. If that
2347 * its way to kill and wake somebody up. This is too early and we might 2327 * fails, somebody else is making progress for us.
2348 * end up not killing anything but false positives are acceptable.
2349 * See freeze_processes.
2350 */ 2328 */
2351 note_oom_kill(); 2329 if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
2330 *did_some_progress = 1;
2331 schedule_timeout_uninterruptible(1);
2332 return NULL;
2333 }
2352 2334
2353 /* 2335 /*
2354 * Go through the zonelist yet one more time, keep very high watermark 2336 * Go through the zonelist yet one more time, keep very high watermark
2355 * here, this is only to catch a parallel oom killing, we must fail if 2337 * here, this is only to catch a parallel oom killing, we must fail if
2356 * we're still under heavy pressure. 2338 * we're still under heavy pressure.
2357 */ 2339 */
2358 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2340 page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
2359 order, zonelist, high_zoneidx, 2341 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
2360 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2361 preferred_zone, classzone_idx, migratetype);
2362 if (page) 2342 if (page)
2363 goto out; 2343 goto out;
2364 2344
2365 if (!(gfp_mask & __GFP_NOFAIL)) { 2345 if (!(gfp_mask & __GFP_NOFAIL)) {
2346 /* Coredumps can quickly deplete all memory reserves */
2347 if (current->flags & PF_DUMPCORE)
2348 goto out;
2366 /* The OOM killer will not help higher order allocs */ 2349 /* The OOM killer will not help higher order allocs */
2367 if (order > PAGE_ALLOC_COSTLY_ORDER) 2350 if (order > PAGE_ALLOC_COSTLY_ORDER)
2368 goto out; 2351 goto out;
2369 /* The OOM killer does not needlessly kill tasks for lowmem */ 2352 /* The OOM killer does not needlessly kill tasks for lowmem */
2370 if (high_zoneidx < ZONE_NORMAL) 2353 if (ac->high_zoneidx < ZONE_NORMAL)
2354 goto out;
2355 /* The OOM killer does not compensate for light reclaim */
2356 if (!(gfp_mask & __GFP_FS))
2371 goto out; 2357 goto out;
2372 /* 2358 /*
2373 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2359 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
@@ -2380,10 +2366,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2380 goto out; 2366 goto out;
2381 } 2367 }
2382 /* Exhausted what can be done so it's blamo time */ 2368 /* Exhausted what can be done so it's blamo time */
2383 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2369 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
2384 2370 *did_some_progress = 1;
2385out: 2371out:
2386 oom_zonelist_unlock(zonelist, gfp_mask); 2372 oom_zonelist_unlock(ac->zonelist, gfp_mask);
2387 return page; 2373 return page;
2388} 2374}
2389 2375
@@ -2391,10 +2377,9 @@ out:
2391/* Try memory compaction for high-order allocations before reclaim */ 2377/* Try memory compaction for high-order allocations before reclaim */
2392static struct page * 2378static struct page *
2393__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2379__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2394 struct zonelist *zonelist, enum zone_type high_zoneidx, 2380 int alloc_flags, const struct alloc_context *ac,
2395 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2381 enum migrate_mode mode, int *contended_compaction,
2396 int classzone_idx, int migratetype, enum migrate_mode mode, 2382 bool *deferred_compaction)
2397 int *contended_compaction, bool *deferred_compaction)
2398{ 2383{
2399 unsigned long compact_result; 2384 unsigned long compact_result;
2400 struct page *page; 2385 struct page *page;
@@ -2403,10 +2388,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2403 return NULL; 2388 return NULL;
2404 2389
2405 current->flags |= PF_MEMALLOC; 2390 current->flags |= PF_MEMALLOC;
2406 compact_result = try_to_compact_pages(zonelist, order, gfp_mask, 2391 compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
2407 nodemask, mode, 2392 mode, contended_compaction);
2408 contended_compaction,
2409 alloc_flags, classzone_idx);
2410 current->flags &= ~PF_MEMALLOC; 2393 current->flags &= ~PF_MEMALLOC;
2411 2394
2412 switch (compact_result) { 2395 switch (compact_result) {
@@ -2425,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2425 */ 2408 */
2426 count_vm_event(COMPACTSTALL); 2409 count_vm_event(COMPACTSTALL);
2427 2410
2428 page = get_page_from_freelist(gfp_mask, nodemask, 2411 page = get_page_from_freelist(gfp_mask, order,
2429 order, zonelist, high_zoneidx, 2412 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2430 alloc_flags & ~ALLOC_NO_WATERMARKS,
2431 preferred_zone, classzone_idx, migratetype);
2432 2413
2433 if (page) { 2414 if (page) {
2434 struct zone *zone = page_zone(page); 2415 struct zone *zone = page_zone(page);
@@ -2452,10 +2433,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2452#else 2433#else
2453static inline struct page * 2434static inline struct page *
2454__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2435__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2455 struct zonelist *zonelist, enum zone_type high_zoneidx, 2436 int alloc_flags, const struct alloc_context *ac,
2456 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2437 enum migrate_mode mode, int *contended_compaction,
2457 int classzone_idx, int migratetype, enum migrate_mode mode, 2438 bool *deferred_compaction)
2458 int *contended_compaction, bool *deferred_compaction)
2459{ 2439{
2460 return NULL; 2440 return NULL;
2461} 2441}
@@ -2463,8 +2443,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2463 2443
2464/* Perform direct synchronous page reclaim */ 2444/* Perform direct synchronous page reclaim */
2465static int 2445static int
2466__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2446__perform_reclaim(gfp_t gfp_mask, unsigned int order,
2467 nodemask_t *nodemask) 2447 const struct alloc_context *ac)
2468{ 2448{
2469 struct reclaim_state reclaim_state; 2449 struct reclaim_state reclaim_state;
2470 int progress; 2450 int progress;
@@ -2478,7 +2458,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2478 reclaim_state.reclaimed_slab = 0; 2458 reclaim_state.reclaimed_slab = 0;
2479 current->reclaim_state = &reclaim_state; 2459 current->reclaim_state = &reclaim_state;
2480 2460
2481 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2461 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
2462 ac->nodemask);
2482 2463
2483 current->reclaim_state = NULL; 2464 current->reclaim_state = NULL;
2484 lockdep_clear_current_reclaim_state(); 2465 lockdep_clear_current_reclaim_state();
@@ -2492,28 +2473,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2492/* The really slow allocator path where we enter direct reclaim */ 2473/* The really slow allocator path where we enter direct reclaim */
2493static inline struct page * 2474static inline struct page *
2494__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2475__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2495 struct zonelist *zonelist, enum zone_type high_zoneidx, 2476 int alloc_flags, const struct alloc_context *ac,
2496 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2477 unsigned long *did_some_progress)
2497 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2498{ 2478{
2499 struct page *page = NULL; 2479 struct page *page = NULL;
2500 bool drained = false; 2480 bool drained = false;
2501 2481
2502 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2482 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
2503 nodemask);
2504 if (unlikely(!(*did_some_progress))) 2483 if (unlikely(!(*did_some_progress)))
2505 return NULL; 2484 return NULL;
2506 2485
2507 /* After successful reclaim, reconsider all zones for allocation */ 2486 /* After successful reclaim, reconsider all zones for allocation */
2508 if (IS_ENABLED(CONFIG_NUMA)) 2487 if (IS_ENABLED(CONFIG_NUMA))
2509 zlc_clear_zones_full(zonelist); 2488 zlc_clear_zones_full(ac->zonelist);
2510 2489
2511retry: 2490retry:
2512 page = get_page_from_freelist(gfp_mask, nodemask, order, 2491 page = get_page_from_freelist(gfp_mask, order,
2513 zonelist, high_zoneidx, 2492 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2514 alloc_flags & ~ALLOC_NO_WATERMARKS,
2515 preferred_zone, classzone_idx,
2516 migratetype);
2517 2493
2518 /* 2494 /*
2519 * If an allocation failed after direct reclaim, it could be because 2495 * If an allocation failed after direct reclaim, it could be because
@@ -2534,36 +2510,30 @@ retry:
2534 */ 2510 */
2535static inline struct page * 2511static inline struct page *
2536__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2512__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2537 struct zonelist *zonelist, enum zone_type high_zoneidx, 2513 const struct alloc_context *ac)
2538 nodemask_t *nodemask, struct zone *preferred_zone,
2539 int classzone_idx, int migratetype)
2540{ 2514{
2541 struct page *page; 2515 struct page *page;
2542 2516
2543 do { 2517 do {
2544 page = get_page_from_freelist(gfp_mask, nodemask, order, 2518 page = get_page_from_freelist(gfp_mask, order,
2545 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2519 ALLOC_NO_WATERMARKS, ac);
2546 preferred_zone, classzone_idx, migratetype);
2547 2520
2548 if (!page && gfp_mask & __GFP_NOFAIL) 2521 if (!page && gfp_mask & __GFP_NOFAIL)
2549 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2522 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
2523 HZ/50);
2550 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2524 } while (!page && (gfp_mask & __GFP_NOFAIL));
2551 2525
2552 return page; 2526 return page;
2553} 2527}
2554 2528
2555static void wake_all_kswapds(unsigned int order, 2529static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
2556 struct zonelist *zonelist,
2557 enum zone_type high_zoneidx,
2558 struct zone *preferred_zone,
2559 nodemask_t *nodemask)
2560{ 2530{
2561 struct zoneref *z; 2531 struct zoneref *z;
2562 struct zone *zone; 2532 struct zone *zone;
2563 2533
2564 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2534 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
2565 high_zoneidx, nodemask) 2535 ac->high_zoneidx, ac->nodemask)
2566 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2536 wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
2567} 2537}
2568 2538
2569static inline int 2539static inline int
@@ -2622,9 +2592,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2622 2592
2623static inline struct page * 2593static inline struct page *
2624__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2594__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2625 struct zonelist *zonelist, enum zone_type high_zoneidx, 2595 struct alloc_context *ac)
2626 nodemask_t *nodemask, struct zone *preferred_zone,
2627 int classzone_idx, int migratetype)
2628{ 2596{
2629 const gfp_t wait = gfp_mask & __GFP_WAIT; 2597 const gfp_t wait = gfp_mask & __GFP_WAIT;
2630 struct page *page = NULL; 2598 struct page *page = NULL;
@@ -2658,10 +2626,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2658 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2626 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2659 goto nopage; 2627 goto nopage;
2660 2628
2661restart: 2629retry:
2662 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2630 if (!(gfp_mask & __GFP_NO_KSWAPD))
2663 wake_all_kswapds(order, zonelist, high_zoneidx, 2631 wake_all_kswapds(order, ac);
2664 preferred_zone, nodemask);
2665 2632
2666 /* 2633 /*
2667 * OK, we're below the kswapd watermark and have kicked background 2634 * OK, we're below the kswapd watermark and have kicked background
@@ -2674,18 +2641,16 @@ restart:
2674 * Find the true preferred zone if the allocation is unconstrained by 2641 * Find the true preferred zone if the allocation is unconstrained by
2675 * cpusets. 2642 * cpusets.
2676 */ 2643 */
2677 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2644 if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
2678 struct zoneref *preferred_zoneref; 2645 struct zoneref *preferred_zoneref;
2679 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2646 preferred_zoneref = first_zones_zonelist(ac->zonelist,
2680 NULL, &preferred_zone); 2647 ac->high_zoneidx, NULL, &ac->preferred_zone);
2681 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2648 ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
2682 } 2649 }
2683 2650
2684rebalance:
2685 /* This is the last chance, in general, before the goto nopage. */ 2651 /* This is the last chance, in general, before the goto nopage. */
2686 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2652 page = get_page_from_freelist(gfp_mask, order,
2687 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2653 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2688 preferred_zone, classzone_idx, migratetype);
2689 if (page) 2654 if (page)
2690 goto got_pg; 2655 goto got_pg;
2691 2656
@@ -2696,11 +2661,10 @@ rebalance:
2696 * the allocation is high priority and these type of 2661 * the allocation is high priority and these type of
2697 * allocations are system rather than user orientated 2662 * allocations are system rather than user orientated
2698 */ 2663 */
2699 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2664 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
2665
2666 page = __alloc_pages_high_priority(gfp_mask, order, ac);
2700 2667
2701 page = __alloc_pages_high_priority(gfp_mask, order,
2702 zonelist, high_zoneidx, nodemask,
2703 preferred_zone, classzone_idx, migratetype);
2704 if (page) { 2668 if (page) {
2705 goto got_pg; 2669 goto got_pg;
2706 } 2670 }
@@ -2729,11 +2693,9 @@ rebalance:
2729 * Try direct compaction. The first pass is asynchronous. Subsequent 2693 * Try direct compaction. The first pass is asynchronous. Subsequent
2730 * attempts after direct reclaim are synchronous 2694 * attempts after direct reclaim are synchronous
2731 */ 2695 */
2732 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2696 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
2733 high_zoneidx, nodemask, alloc_flags, 2697 migration_mode,
2734 preferred_zone, 2698 &contended_compaction,
2735 classzone_idx, migratetype,
2736 migration_mode, &contended_compaction,
2737 &deferred_compaction); 2699 &deferred_compaction);
2738 if (page) 2700 if (page)
2739 goto got_pg; 2701 goto got_pg;
@@ -2779,74 +2741,40 @@ rebalance:
2779 migration_mode = MIGRATE_SYNC_LIGHT; 2741 migration_mode = MIGRATE_SYNC_LIGHT;
2780 2742
2781 /* Try direct reclaim and then allocating */ 2743 /* Try direct reclaim and then allocating */
2782 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2744 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
2783 zonelist, high_zoneidx, 2745 &did_some_progress);
2784 nodemask,
2785 alloc_flags, preferred_zone,
2786 classzone_idx, migratetype,
2787 &did_some_progress);
2788 if (page) 2746 if (page)
2789 goto got_pg; 2747 goto got_pg;
2790 2748
2791 /*
2792 * If we failed to make any progress reclaiming, then we are
2793 * running out of options and have to consider going OOM
2794 */
2795 if (!did_some_progress) {
2796 if (oom_gfp_allowed(gfp_mask)) {
2797 if (oom_killer_disabled)
2798 goto nopage;
2799 /* Coredumps can quickly deplete all memory reserves */
2800 if ((current->flags & PF_DUMPCORE) &&
2801 !(gfp_mask & __GFP_NOFAIL))
2802 goto nopage;
2803 page = __alloc_pages_may_oom(gfp_mask, order,
2804 zonelist, high_zoneidx,
2805 nodemask, preferred_zone,
2806 classzone_idx, migratetype);
2807 if (page)
2808 goto got_pg;
2809
2810 if (!(gfp_mask & __GFP_NOFAIL)) {
2811 /*
2812 * The oom killer is not called for high-order
2813 * allocations that may fail, so if no progress
2814 * is being made, there are no other options and
2815 * retrying is unlikely to help.
2816 */
2817 if (order > PAGE_ALLOC_COSTLY_ORDER)
2818 goto nopage;
2819 /*
2820 * The oom killer is not called for lowmem
2821 * allocations to prevent needlessly killing
2822 * innocent tasks.
2823 */
2824 if (high_zoneidx < ZONE_NORMAL)
2825 goto nopage;
2826 }
2827
2828 goto restart;
2829 }
2830 }
2831
2832 /* Check if we should retry the allocation */ 2749 /* Check if we should retry the allocation */
2833 pages_reclaimed += did_some_progress; 2750 pages_reclaimed += did_some_progress;
2834 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2751 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2835 pages_reclaimed)) { 2752 pages_reclaimed)) {
2753 /*
2754 * If we fail to make progress by freeing individual
2755 * pages, but the allocation wants us to keep going,
2756 * start OOM killing tasks.
2757 */
2758 if (!did_some_progress) {
2759 page = __alloc_pages_may_oom(gfp_mask, order, ac,
2760 &did_some_progress);
2761 if (page)
2762 goto got_pg;
2763 if (!did_some_progress)
2764 goto nopage;
2765 }
2836 /* Wait for some write requests to complete then retry */ 2766 /* Wait for some write requests to complete then retry */
2837 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2767 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
2838 goto rebalance; 2768 goto retry;
2839 } else { 2769 } else {
2840 /* 2770 /*
2841 * High-order allocations do not necessarily loop after 2771 * High-order allocations do not necessarily loop after
2842 * direct reclaim and reclaim/compaction depends on compaction 2772 * direct reclaim and reclaim/compaction depends on compaction
2843 * being called after reclaim so call directly if necessary 2773 * being called after reclaim so call directly if necessary
2844 */ 2774 */
2845 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2775 page = __alloc_pages_direct_compact(gfp_mask, order,
2846 high_zoneidx, nodemask, alloc_flags, 2776 alloc_flags, ac, migration_mode,
2847 preferred_zone, 2777 &contended_compaction,
2848 classzone_idx, migratetype,
2849 migration_mode, &contended_compaction,
2850 &deferred_compaction); 2778 &deferred_compaction);
2851 if (page) 2779 if (page)
2852 goto got_pg; 2780 goto got_pg;
@@ -2854,11 +2782,7 @@ rebalance:
2854 2782
2855nopage: 2783nopage:
2856 warn_alloc_failed(gfp_mask, order, NULL); 2784 warn_alloc_failed(gfp_mask, order, NULL);
2857 return page;
2858got_pg: 2785got_pg:
2859 if (kmemcheck_enabled)
2860 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2861
2862 return page; 2786 return page;
2863} 2787}
2864 2788
@@ -2869,14 +2793,16 @@ struct page *
2869__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2793__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2870 struct zonelist *zonelist, nodemask_t *nodemask) 2794 struct zonelist *zonelist, nodemask_t *nodemask)
2871{ 2795{
2872 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2873 struct zone *preferred_zone;
2874 struct zoneref *preferred_zoneref; 2796 struct zoneref *preferred_zoneref;
2875 struct page *page = NULL; 2797 struct page *page = NULL;
2876 int migratetype = gfpflags_to_migratetype(gfp_mask);
2877 unsigned int cpuset_mems_cookie; 2798 unsigned int cpuset_mems_cookie;
2878 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2799 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2879 int classzone_idx; 2800 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
2801 struct alloc_context ac = {
2802 .high_zoneidx = gfp_zone(gfp_mask),
2803 .nodemask = nodemask,
2804 .migratetype = gfpflags_to_migratetype(gfp_mask),
2805 };
2880 2806
2881 gfp_mask &= gfp_allowed_mask; 2807 gfp_mask &= gfp_allowed_mask;
2882 2808
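
Most of the churn in this patch comes from bundling the per-allocation parameters into struct alloc_context and passing a single pointer around instead of six positional arguments. A simplified illustration of that parameter-object pattern with placeholder fields (the real struct is defined elsewhere in mm/ and not reproduced here):

#include <stdio.h>

struct toy_alloc_context {
	int high_zoneidx;
	int classzone_idx;
	int migratetype;
	const char *preferred_zone;
};

static void toy_get_page(unsigned int order, const struct toy_alloc_context *ac)
{
	printf("order-%u allocation from zone %s, migratetype %d\n",
	       order, ac->preferred_zone, ac->migratetype);
}

int main(void)
{
	struct toy_alloc_context ac = {
		.high_zoneidx	= 2,
		.classzone_idx	= 2,
		.migratetype	= 0,
		.preferred_zone	= "Normal",
	};

	/* every helper now takes just (order, &ac) instead of a long list */
	toy_get_page(0, &ac);
	return 0;
}
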
@@ -2895,37 +2821,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2895 if (unlikely(!zonelist->_zonerefs->zone)) 2821 if (unlikely(!zonelist->_zonerefs->zone))
2896 return NULL; 2822 return NULL;
2897 2823
2898 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) 2824 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
2899 alloc_flags |= ALLOC_CMA; 2825 alloc_flags |= ALLOC_CMA;
2900 2826
2901retry_cpuset: 2827retry_cpuset:
2902 cpuset_mems_cookie = read_mems_allowed_begin(); 2828 cpuset_mems_cookie = read_mems_allowed_begin();
2903 2829
2830 /* We set it here, as __alloc_pages_slowpath might have changed it */
2831 ac.zonelist = zonelist;
2904 /* The preferred zone is used for statistics later */ 2832 /* The preferred zone is used for statistics later */
2905 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2833 preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
2906 nodemask ? : &cpuset_current_mems_allowed, 2834 ac.nodemask ? : &cpuset_current_mems_allowed,
2907 &preferred_zone); 2835 &ac.preferred_zone);
2908 if (!preferred_zone) 2836 if (!ac.preferred_zone)
2909 goto out; 2837 goto out;
2910 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2838 ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
2911 2839
2912 /* First allocation attempt */ 2840 /* First allocation attempt */
2913 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2841 alloc_mask = gfp_mask|__GFP_HARDWALL;
2914 zonelist, high_zoneidx, alloc_flags, 2842 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
2915 preferred_zone, classzone_idx, migratetype);
2916 if (unlikely(!page)) { 2843 if (unlikely(!page)) {
2917 /* 2844 /*
2918 * Runtime PM, block IO and its error handling path 2845 * Runtime PM, block IO and its error handling path
2919 * can deadlock because I/O on the device might not 2846 * can deadlock because I/O on the device might not
2920 * complete. 2847 * complete.
2921 */ 2848 */
2922 gfp_mask = memalloc_noio_flags(gfp_mask); 2849 alloc_mask = memalloc_noio_flags(gfp_mask);
2923 page = __alloc_pages_slowpath(gfp_mask, order, 2850
2924 zonelist, high_zoneidx, nodemask, 2851 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
2925 preferred_zone, classzone_idx, migratetype);
2926 } 2852 }
2927 2853
2928 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2854 if (kmemcheck_enabled && page)
2855 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2856
2857 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
2929 2858
2930out: 2859out:
2931 /* 2860 /*
@@ -3945,18 +3874,29 @@ static int __build_all_zonelists(void *data)
3945 return 0; 3874 return 0;
3946} 3875}
3947 3876
3877static noinline void __init
3878build_all_zonelists_init(void)
3879{
3880 __build_all_zonelists(NULL);
3881 mminit_verify_zonelist();
3882 cpuset_init_current_mems_allowed();
3883}
3884
3948/* 3885/*
3949 * Called with zonelists_mutex held always 3886 * Called with zonelists_mutex held always
3950 * unless system_state == SYSTEM_BOOTING. 3887 * unless system_state == SYSTEM_BOOTING.
3888 *
3889 * __ref due to (1) call of __meminit annotated setup_zone_pageset
3890 * [we're only called with non-NULL zone through __meminit paths] and
3891 * (2) call of __init annotated helper build_all_zonelists_init
3892 * [protected by SYSTEM_BOOTING].
3951 */ 3893 */
3952void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3894void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3953{ 3895{
3954 set_zonelist_order(); 3896 set_zonelist_order();
3955 3897
3956 if (system_state == SYSTEM_BOOTING) { 3898 if (system_state == SYSTEM_BOOTING) {
3957 __build_all_zonelists(NULL); 3899 build_all_zonelists_init();
3958 mminit_verify_zonelist();
3959 cpuset_init_current_mems_allowed();
3960 } else { 3900 } else {
3961#ifdef CONFIG_MEMORY_HOTPLUG 3901#ifdef CONFIG_MEMORY_HOTPLUG
3962 if (zone) 3902 if (zone)
@@ -5059,8 +4999,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5059 pgdat->node_start_pfn = node_start_pfn; 4999 pgdat->node_start_pfn = node_start_pfn;
5060#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5000#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5061 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5001 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5062 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, 5002 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5063 (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); 5003 (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
5064#endif 5004#endif
5065 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5005 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5066 zones_size, zholes_size); 5006 zones_size, zholes_size);
@@ -5432,9 +5372,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5432 arch_zone_highest_possible_pfn[i]) 5372 arch_zone_highest_possible_pfn[i])
5433 pr_cont("empty\n"); 5373 pr_cont("empty\n");
5434 else 5374 else
5435 pr_cont("[mem %0#10lx-%0#10lx]\n", 5375 pr_cont("[mem %#018Lx-%#018Lx]\n",
5436 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5376 (u64)arch_zone_lowest_possible_pfn[i]
5437 (arch_zone_highest_possible_pfn[i] 5377 << PAGE_SHIFT,
5378 ((u64)arch_zone_highest_possible_pfn[i]
5438 << PAGE_SHIFT) - 1); 5379 << PAGE_SHIFT) - 1);
5439 } 5380 }
5440 5381
@@ -5442,15 +5383,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5442 pr_info("Movable zone start for each node\n"); 5383 pr_info("Movable zone start for each node\n");
5443 for (i = 0; i < MAX_NUMNODES; i++) { 5384 for (i = 0; i < MAX_NUMNODES; i++) {
5444 if (zone_movable_pfn[i]) 5385 if (zone_movable_pfn[i])
5445 pr_info(" Node %d: %#010lx\n", i, 5386 pr_info(" Node %d: %#018Lx\n", i,
5446 zone_movable_pfn[i] << PAGE_SHIFT); 5387 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
5447 } 5388 }
5448 5389
5449 /* Print out the early node map */ 5390 /* Print out the early node map */
5450 pr_info("Early memory node ranges\n"); 5391 pr_info("Early memory node ranges\n");
5451 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5392 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5452 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5393 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
5453 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5394 (u64)start_pfn << PAGE_SHIFT,
5395 ((u64)end_pfn << PAGE_SHIFT) - 1);
5454 5396
5455 /* Initialise every node */ 5397 /* Initialise every node */
5456 mminit_verify_pageflags_layout(); 5398 mminit_verify_pageflags_layout();
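
The printk format changes above cast PFNs to u64 before shifting, because pfn << PAGE_SHIFT overflows unsigned long on 32-bit kernels with enough memory. A small userspace demonstration of the difference, emulating the 32-bit case with fixed-width types:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* a PFN for roughly 4.5GB of RAM; the PFN fits in 32 bits, its address does not */
	unsigned int end_pfn = 0x120000;

	unsigned int truncated = end_pfn << PAGE_SHIFT;		/* wraps in 32 bits */
	uint64_t full = (uint64_t)end_pfn << PAGE_SHIFT;	/* what the cast fixes */

	printf("without cast: %#010x\n", truncated);
	printf("with cast:    %#018" PRIx64 "\n", full);
	return 0;
}
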