path: root/mm/page_alloc.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 479
 1 file changed, 278 insertions(+), 201 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 77e4d3c5c57b..73f5d4556b3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
24#include <linux/memblock.h> 24#include <linux/memblock.h>
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kmemcheck.h>
28#include <linux/kasan.h> 27#include <linux/kasan.h>
29#include <linux/module.h> 28#include <linux/module.h>
30#include <linux/suspend.h> 29#include <linux/suspend.h>
@@ -83,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
83EXPORT_PER_CPU_SYMBOL(numa_node); 82EXPORT_PER_CPU_SYMBOL(numa_node);
84#endif 83#endif
85 84
85DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
86
86#ifdef CONFIG_HAVE_MEMORYLESS_NODES 87#ifdef CONFIG_HAVE_MEMORYLESS_NODES
87/* 88/*
88 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. 89 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -290,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes);
290int page_group_by_mobility_disabled __read_mostly; 291int page_group_by_mobility_disabled __read_mostly;
291 292
292#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 293#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
294
295/*
296 * Determine how many pages need to be initialized during early boot
297 * (non-deferred initialization).
298 * The value of first_deferred_pfn will be set later, once non-deferred pages
299 * are initialized, but for now set it to ULONG_MAX.
300 */
293static inline void reset_deferred_meminit(pg_data_t *pgdat) 301static inline void reset_deferred_meminit(pg_data_t *pgdat)
294{ 302{
295 unsigned long max_initialise; 303 phys_addr_t start_addr, end_addr;
296 unsigned long reserved_lowmem; 304 unsigned long max_pgcnt;
305 unsigned long reserved;
297 306
298 /* 307 /*
299 * Initialise at least 2G of a node but also take into account that 308 * Initialise at least 2G of a node but also take into account that
300 * two large system hashes that can take up 1GB for 0.25TB/node. 309 * two large system hashes that can take up 1GB for 0.25TB/node.
301 */ 310 */
302 max_initialise = max(2UL << (30 - PAGE_SHIFT), 311 max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
303 (pgdat->node_spanned_pages >> 8)); 312 (pgdat->node_spanned_pages >> 8));
304 313
305 /* 314 /*
306 * Compensate for all the memblock reservations (e.g. crash kernel) 315 * Compensate for all the memblock reservations (e.g. crash kernel)
307 * from the initial estimation to make sure we will initialize enough 316 * from the initial estimation to make sure we will initialize enough
308 * memory to boot. 317 * memory to boot.
309 */ 318 */
310 reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, 319 start_addr = PFN_PHYS(pgdat->node_start_pfn);
311 pgdat->node_start_pfn + max_initialise); 320 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
312 max_initialise += reserved_lowmem; 321 reserved = memblock_reserved_memory_within(start_addr, end_addr);
322 max_pgcnt += PHYS_PFN(reserved);
313 323
314 pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); 324 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
315 pgdat->first_deferred_pfn = ULONG_MAX; 325 pgdat->first_deferred_pfn = ULONG_MAX;
316} 326}
317 327
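
The rewrite keeps the calculation in pages, but memblock_reserved_memory_within() works on physical addresses, hence the PFN_PHYS()/PHYS_PFN() round trip; the old code passed pfns straight through, which is what this fixes. For illustration only, a small userspace sketch of the same arithmetic, assuming 4K pages (PAGE_SHIFT = 12); the node and reservation numbers are invented and the names merely mirror the kernel fields:

    /*
     * Userspace sketch of the max_pgcnt computation above. Assumes 4K pages;
     * all values are invented for illustration.
     */
    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PFN_PHYS(pfn)  ((uint64_t)(pfn) << PAGE_SHIFT)
    #define PHYS_PFN(addr) ((uint64_t)(addr) >> PAGE_SHIFT)

    int main(void)
    {
            uint64_t node_start_pfn = 0x100000;        /* assumed: node starts at 4G */
            uint64_t node_spanned_pages = 64ULL << 18; /* assumed: 64G of 4K pages */
            uint64_t reserved = 256ULL << 20;          /* assumed: 256M reserved (crashkernel etc.) */

            /* at least 2G worth of pages, or 1/256th of the node, whichever is larger */
            uint64_t max_pgcnt = 2ULL << (30 - PAGE_SHIFT);
            if ((node_spanned_pages >> 8) > max_pgcnt)
                    max_pgcnt = node_spanned_pages >> 8;

            /* the reservation query works on bytes, so convert pfn -> phys and back */
            uint64_t start_addr = PFN_PHYS(node_start_pfn);
            uint64_t end_addr   = PFN_PHYS(node_start_pfn + max_pgcnt);
            max_pgcnt += PHYS_PFN(reserved);

            printf("query reservations in [%#llx, %#llx), init %llu pages up front\n",
                   (unsigned long long)start_addr, (unsigned long long)end_addr,
                   (unsigned long long)max_pgcnt);
            return 0;
    }
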
@@ -338,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat,
338 if (zone_end < pgdat_end_pfn(pgdat)) 348 if (zone_end < pgdat_end_pfn(pgdat))
339 return true; 349 return true;
340 (*nr_initialised)++; 350 (*nr_initialised)++;
341 if ((*nr_initialised > pgdat->static_init_size) && 351 if ((*nr_initialised > pgdat->static_init_pgcnt) &&
342 (pfn & (PAGES_PER_SECTION - 1)) == 0) { 352 (pfn & (PAGES_PER_SECTION - 1)) == 0) {
343 pgdat->first_deferred_pfn = pfn; 353 pgdat->first_deferred_pfn = pfn;
344 return false; 354 return false;
@@ -1013,7 +1023,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
1013 VM_BUG_ON_PAGE(PageTail(page), page); 1023 VM_BUG_ON_PAGE(PageTail(page), page);
1014 1024
1015 trace_mm_page_free(page, order); 1025 trace_mm_page_free(page, order);
1016 kmemcheck_free_shadow(page, order);
1017 1026
1018 /* 1027 /*
1019 * Check tail pages before head page information is cleared to 1028 * Check tail pages before head page information is cleared to
@@ -1170,6 +1179,7 @@ static void free_one_page(struct zone *zone,
1170static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1179static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1171 unsigned long zone, int nid) 1180 unsigned long zone, int nid)
1172{ 1181{
1182 mm_zero_struct_page(page);
1173 set_page_links(page, zone, nid, pfn); 1183 set_page_links(page, zone, nid, pfn);
1174 init_page_count(page); 1184 init_page_count(page);
1175 page_mapcount_reset(page); 1185 page_mapcount_reset(page);
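
mm_zero_struct_page() clears the struct page before any field is set, so __init_single_page() no longer depends on the memmap coming from pre-zeroed memory (elsewhere in this series the memmap can come from non-zeroed "raw" memblock allocations). A plausible generic fallback is just a memset over the struct; this is an illustrative sketch, not necessarily the exact definition in include/linux/mm.h, and architectures may provide an optimized variant:

    /* Illustrative generic fallback; the real definition may differ and an
     * architecture can override it with an optimized store sequence. */
    #include <linux/mm_types.h>
    #include <linux/string.h>

    #ifndef mm_zero_struct_page
    #define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
    #endif
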
@@ -1410,14 +1420,17 @@ void clear_zone_contiguous(struct zone *zone)
1410} 1420}
1411 1421
1412#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1422#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1413static void __init deferred_free_range(struct page *page, 1423static void __init deferred_free_range(unsigned long pfn,
1414 unsigned long pfn, int nr_pages) 1424 unsigned long nr_pages)
1415{ 1425{
1416 int i; 1426 struct page *page;
1427 unsigned long i;
1417 1428
1418 if (!page) 1429 if (!nr_pages)
1419 return; 1430 return;
1420 1431
1432 page = pfn_to_page(pfn);
1433
1421 /* Free a large naturally-aligned chunk if possible */ 1434 /* Free a large naturally-aligned chunk if possible */
1422 if (nr_pages == pageblock_nr_pages && 1435 if (nr_pages == pageblock_nr_pages &&
1423 (pfn & (pageblock_nr_pages - 1)) == 0) { 1436 (pfn & (pageblock_nr_pages - 1)) == 0) {
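
deferred_free_range() now takes only (pfn, nr_pages) and looks the page up itself; the fast path still requires the chunk to start on a pageblock boundary. A runnable toy check of that alignment test, with pageblock_nr_pages assumed to be 512 (2M pageblocks on 4K base pages) and invented pfns:

    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL  /* assumed: 2M pageblocks, 4K base pages */

    int main(void)
    {
            unsigned long pfns[] = { 0, 511, 512, 1024, 1539 };

            for (unsigned int i = 0; i < sizeof(pfns) / sizeof(pfns[0]); i++) {
                    unsigned long pfn = pfns[i];

                    printf("pfn %4lu: %s\n", pfn,
                           (pfn & (PAGEBLOCK_NR_PAGES - 1)) == 0 ?
                           "pageblock head, freed as one chunk" :
                           "freed page by page");
            }
            return 0;
    }
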
@@ -1443,19 +1456,109 @@ static inline void __init pgdat_init_report_one_done(void)
1443 complete(&pgdat_init_all_done_comp); 1456 complete(&pgdat_init_all_done_comp);
1444} 1457}
1445 1458
1459/*
1460 * Helper for deferred_init_range(): free the given range, reset the counters,
1461 * and return the number of pages freed.
1462 */
1463static inline unsigned long __init __def_free(unsigned long *nr_free,
1464 unsigned long *free_base_pfn,
1465 struct page **page)
1466{
1467 unsigned long nr = *nr_free;
1468
1469 deferred_free_range(*free_base_pfn, nr);
1470 *free_base_pfn = 0;
1471 *nr_free = 0;
1472 *page = NULL;
1473
1474 return nr;
1475}
1476
1477static unsigned long __init deferred_init_range(int nid, int zid,
1478 unsigned long start_pfn,
1479 unsigned long end_pfn)
1480{
1481 struct mminit_pfnnid_cache nid_init_state = { };
1482 unsigned long nr_pgmask = pageblock_nr_pages - 1;
1483 unsigned long free_base_pfn = 0;
1484 unsigned long nr_pages = 0;
1485 unsigned long nr_free = 0;
1486 struct page *page = NULL;
1487 unsigned long pfn;
1488
1489 /*
1490 * First we check if pfn is valid on architectures where it is possible
1491 * to have holes within pageblock_nr_pages. On systems where it is not
1492 * possible, this function is optimized out.
1493 *
1494 * Then, we check if a current large page is valid by only checking the
1495 * validity of the head pfn.
1496 *
1497 * meminit_pfn_in_nid is checked on systems where pfns can interleave
1498 * within a node: a pfn is between start and end of a node, but does not
1499 * belong to this memory node.
1500 *
1501 * Finally, we minimize pfn page lookups and scheduler checks by
1502 * performing it only once every pageblock_nr_pages.
1503 *
1504 * We do it in two loops: first we initialize struct page, then free to
1505 * the buddy allocator, because while we are freeing pages we can access
1506 * pages that are ahead (computing buddy page in __free_one_page()).
1507 */
1508 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1509 if (!pfn_valid_within(pfn))
1510 continue;
1511 if ((pfn & nr_pgmask) || pfn_valid(pfn)) {
1512 if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1513 if (page && (pfn & nr_pgmask))
1514 page++;
1515 else
1516 page = pfn_to_page(pfn);
1517 __init_single_page(page, pfn, zid, nid);
1518 cond_resched();
1519 }
1520 }
1521 }
1522
1523 page = NULL;
1524 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1525 if (!pfn_valid_within(pfn)) {
1526 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1527 } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) {
1528 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1529 } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1530 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1531 } else if (page && (pfn & nr_pgmask)) {
1532 page++;
1533 nr_free++;
1534 } else {
1535 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1536 page = pfn_to_page(pfn);
1537 free_base_pfn = pfn;
1538 nr_free = 1;
1539 cond_resched();
1540 }
1541 }
1542 /* Free the last block of pages to allocator */
1543 nr_pages += __def_free(&nr_free, &free_base_pfn, &page);
1544
1545 return nr_pages;
1546}
1547
1446/* Initialise remaining memory on a node */ 1548/* Initialise remaining memory on a node */
1447static int __init deferred_init_memmap(void *data) 1549static int __init deferred_init_memmap(void *data)
1448{ 1550{
1449 pg_data_t *pgdat = data; 1551 pg_data_t *pgdat = data;
1450 int nid = pgdat->node_id; 1552 int nid = pgdat->node_id;
1451 struct mminit_pfnnid_cache nid_init_state = { };
1452 unsigned long start = jiffies; 1553 unsigned long start = jiffies;
1453 unsigned long nr_pages = 0; 1554 unsigned long nr_pages = 0;
1454 unsigned long walk_start, walk_end; 1555 unsigned long spfn, epfn;
1455 int i, zid; 1556 phys_addr_t spa, epa;
1557 int zid;
1456 struct zone *zone; 1558 struct zone *zone;
1457 unsigned long first_init_pfn = pgdat->first_deferred_pfn; 1559 unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1458 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); 1560 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1561 u64 i;
1459 1562
1460 if (first_init_pfn == ULONG_MAX) { 1563 if (first_init_pfn == ULONG_MAX) {
1461 pgdat_init_report_one_done(); 1564 pgdat_init_report_one_done();
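
The rework splits the old single walk into deferred_init_range() (two passes: initialize all struct pages first, then free them, per the comment above) and a caller that iterates memblock's free ranges in physical addresses and clamps each one to the zone. A standalone sketch of just the clamping step, assuming 4K pages; PFN_UP()/PFN_DOWN() round inward so only fully backed pages reach the init/free passes, and every address below is invented:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    int main(void)
    {
            uint64_t spa = 0x100001200;          /* assumed: range start, not page aligned */
            uint64_t epa = 0x140000800;          /* assumed: range end, not page aligned */
            uint64_t first_init_pfn = 0x100400;  /* assumed: where early init stopped */
            uint64_t zone_end = 0x180000;        /* assumed: zone_end_pfn(zone) */

            /* round the start up and the end down: only fully backed pages count */
            uint64_t spfn = PFN_UP(spa);
            uint64_t epfn = PFN_DOWN(epa);

            if (spfn < first_init_pfn)
                    spfn = first_init_pfn;
            if (epfn > zone_end)
                    epfn = zone_end;

            printf("deferred_init_range() gets pfns [%#llx, %#llx)\n",
                   (unsigned long long)spfn, (unsigned long long)epfn);
            return 0;
    }
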
@@ -1477,83 +1580,12 @@ static int __init deferred_init_memmap(void *data)
1477 if (first_init_pfn < zone_end_pfn(zone)) 1580 if (first_init_pfn < zone_end_pfn(zone))
1478 break; 1581 break;
1479 } 1582 }
1583 first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
1480 1584
1481 for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { 1585 for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1482 unsigned long pfn, end_pfn; 1586 spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1483 struct page *page = NULL; 1587 epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1484 struct page *free_base_page = NULL; 1588 nr_pages += deferred_init_range(nid, zid, spfn, epfn);
1485 unsigned long free_base_pfn = 0;
1486 int nr_to_free = 0;
1487
1488 end_pfn = min(walk_end, zone_end_pfn(zone));
1489 pfn = first_init_pfn;
1490 if (pfn < walk_start)
1491 pfn = walk_start;
1492 if (pfn < zone->zone_start_pfn)
1493 pfn = zone->zone_start_pfn;
1494
1495 for (; pfn < end_pfn; pfn++) {
1496 if (!pfn_valid_within(pfn))
1497 goto free_range;
1498
1499 /*
1500 * Ensure pfn_valid is checked every
1501 * pageblock_nr_pages for memory holes
1502 */
1503 if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1504 if (!pfn_valid(pfn)) {
1505 page = NULL;
1506 goto free_range;
1507 }
1508 }
1509
1510 if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1511 page = NULL;
1512 goto free_range;
1513 }
1514
1515 /* Minimise pfn page lookups and scheduler checks */
1516 if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1517 page++;
1518 } else {
1519 nr_pages += nr_to_free;
1520 deferred_free_range(free_base_page,
1521 free_base_pfn, nr_to_free);
1522 free_base_page = NULL;
1523 free_base_pfn = nr_to_free = 0;
1524
1525 page = pfn_to_page(pfn);
1526 cond_resched();
1527 }
1528
1529 if (page->flags) {
1530 VM_BUG_ON(page_zone(page) != zone);
1531 goto free_range;
1532 }
1533
1534 __init_single_page(page, pfn, zid, nid);
1535 if (!free_base_page) {
1536 free_base_page = page;
1537 free_base_pfn = pfn;
1538 nr_to_free = 0;
1539 }
1540 nr_to_free++;
1541
1542 /* Where possible, batch up pages for a single free */
1543 continue;
1544free_range:
1545 /* Free the current block of pages to allocator */
1546 nr_pages += nr_to_free;
1547 deferred_free_range(free_base_page, free_base_pfn,
1548 nr_to_free);
1549 free_base_page = NULL;
1550 free_base_pfn = nr_to_free = 0;
1551 }
1552 /* Free the last block of pages to allocator */
1553 nr_pages += nr_to_free;
1554 deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1555
1556 first_init_pfn = max(end_pfn, first_init_pfn);
1557 } 1589 }
1558 1590
1559 /* Sanity check that the next zone really is unpopulated */ 1591 /* Sanity check that the next zone really is unpopulated */
@@ -1792,7 +1824,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
1792 * Go through the free lists for the given migratetype and remove 1824 * Go through the free lists for the given migratetype and remove
1793 * the smallest available page from the freelists 1825 * the smallest available page from the freelists
1794 */ 1826 */
1795static inline 1827static __always_inline
1796struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 1828struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1797 int migratetype) 1829 int migratetype)
1798{ 1830{
@@ -1836,7 +1868,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
1836}; 1868};
1837 1869
1838#ifdef CONFIG_CMA 1870#ifdef CONFIG_CMA
1839static struct page *__rmqueue_cma_fallback(struct zone *zone, 1871static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1840 unsigned int order) 1872 unsigned int order)
1841{ 1873{
1842 return __rmqueue_smallest(zone, order, MIGRATE_CMA); 1874 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
@@ -2217,7 +2249,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2217 * deviation from the rest of this file, to make the for loop 2249 * deviation from the rest of this file, to make the for loop
2218 * condition simpler. 2250 * condition simpler.
2219 */ 2251 */
2220static inline bool 2252static __always_inline bool
2221__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 2253__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2222{ 2254{
2223 struct free_area *area; 2255 struct free_area *area;
@@ -2289,8 +2321,8 @@ do_steal:
2289 * Do the hard work of removing an element from the buddy allocator. 2321 * Do the hard work of removing an element from the buddy allocator.
2290 * Call me with the zone->lock already held. 2322 * Call me with the zone->lock already held.
2291 */ 2323 */
2292static struct page *__rmqueue(struct zone *zone, unsigned int order, 2324static __always_inline struct page *
2293 int migratetype) 2325__rmqueue(struct zone *zone, unsigned int order, int migratetype)
2294{ 2326{
2295 struct page *page; 2327 struct page *page;
2296 2328
@@ -2315,7 +2347,7 @@ retry:
2315 */ 2347 */
2316static int rmqueue_bulk(struct zone *zone, unsigned int order, 2348static int rmqueue_bulk(struct zone *zone, unsigned int order,
2317 unsigned long count, struct list_head *list, 2349 unsigned long count, struct list_head *list,
2318 int migratetype, bool cold) 2350 int migratetype)
2319{ 2351{
2320 int i, alloced = 0; 2352 int i, alloced = 0;
2321 2353
@@ -2329,19 +2361,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2329 continue; 2361 continue;
2330 2362
2331 /* 2363 /*
2332 * Split buddy pages returned by expand() are received here 2364 * Split buddy pages returned by expand() are received here in
2333 * in physical page order. The page is added to the callers and 2365 * physical page order. The page is added to the tail of
2334 * list and the list head then moves forward. From the callers 2366 * caller's list. From the callers perspective, the linked list
2335 * perspective, the linked list is ordered by page number in 2367 * is ordered by page number under some conditions. This is
2336 * some conditions. This is useful for IO devices that can 2368 * useful for IO devices that can forward direction from the
2337 * merge IO requests if the physical pages are ordered 2369 * head, thus also in the physical page order. This is useful
2338 * properly. 2370 * for IO devices that can merge IO requests if the physical
2371 * pages are ordered properly.
2339 */ 2372 */
2340 if (likely(!cold)) 2373 list_add_tail(&page->lru, list);
2341 list_add(&page->lru, list);
2342 else
2343 list_add_tail(&page->lru, list);
2344 list = &page->lru;
2345 alloced++; 2374 alloced++;
2346 if (is_migrate_cma(get_pcppage_migratetype(page))) 2375 if (is_migrate_cma(get_pcppage_migratetype(page)))
2347 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2376 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
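
With the hot/cold hint gone, every page split off by expand() is appended at the tail, so the per-cpu list stays in ascending pfn order and the consumer always pops from the head. A toy, plain-C model of that ordering argument; fake_page and the hand-rolled tail append are stand-ins, not kernel APIs:

    #include <stdio.h>
    #include <stddef.h>

    struct fake_page { unsigned long pfn; struct fake_page *next; };

    int main(void)
    {
            struct fake_page pages[4] = {
                    { .pfn = 4096 }, { .pfn = 4097 }, { .pfn = 4098 }, { .pfn = 4099 }
            };
            struct fake_page *head = NULL, **tail = &head;

            /* "rmqueue_bulk": append in allocation order, i.e. ascending pfn */
            for (int i = 0; i < 4; i++) {
                    pages[i].next = NULL;
                    *tail = &pages[i];
                    tail = &pages[i].next;
            }

            /* the consumer (__rmqueue_pcplist) now always takes from the head */
            for (struct fake_page *p = head; p; p = p->next)
                    printf("pfn %lu\n", p->pfn);
            return 0;
    }
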
@@ -2478,10 +2507,6 @@ void drain_all_pages(struct zone *zone)
2478 if (WARN_ON_ONCE(!mm_percpu_wq)) 2507 if (WARN_ON_ONCE(!mm_percpu_wq))
2479 return; 2508 return;
2480 2509
2481 /* Workqueues cannot recurse */
2482 if (current->flags & PF_WQ_WORKER)
2483 return;
2484
2485 /* 2510 /*
2486 * Do not drain if one is already in progress unless it's specific to 2511 * Do not drain if one is already in progress unless it's specific to
2487 * a zone. Such callers are primarily CMA and memory hotplug and need 2512 * a zone. Such callers are primarily CMA and memory hotplug and need
@@ -2590,24 +2615,25 @@ void mark_free_pages(struct zone *zone)
2590} 2615}
2591#endif /* CONFIG_PM */ 2616#endif /* CONFIG_PM */
2592 2617
2593/* 2618static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
2594 * Free a 0-order page
2595 * cold == true ? free a cold page : free a hot page
2596 */
2597void free_hot_cold_page(struct page *page, bool cold)
2598{ 2619{
2599 struct zone *zone = page_zone(page);
2600 struct per_cpu_pages *pcp;
2601 unsigned long flags;
2602 unsigned long pfn = page_to_pfn(page);
2603 int migratetype; 2620 int migratetype;
2604 2621
2605 if (!free_pcp_prepare(page)) 2622 if (!free_pcp_prepare(page))
2606 return; 2623 return false;
2607 2624
2608 migratetype = get_pfnblock_migratetype(page, pfn); 2625 migratetype = get_pfnblock_migratetype(page, pfn);
2609 set_pcppage_migratetype(page, migratetype); 2626 set_pcppage_migratetype(page, migratetype);
2610 local_irq_save(flags); 2627 return true;
2628}
2629
2630static void free_unref_page_commit(struct page *page, unsigned long pfn)
2631{
2632 struct zone *zone = page_zone(page);
2633 struct per_cpu_pages *pcp;
2634 int migratetype;
2635
2636 migratetype = get_pcppage_migratetype(page);
2611 __count_vm_event(PGFREE); 2637 __count_vm_event(PGFREE);
2612 2638
2613 /* 2639 /*
@@ -2620,38 +2646,62 @@ void free_hot_cold_page(struct page *page, bool cold)
2620 if (migratetype >= MIGRATE_PCPTYPES) { 2646 if (migratetype >= MIGRATE_PCPTYPES) {
2621 if (unlikely(is_migrate_isolate(migratetype))) { 2647 if (unlikely(is_migrate_isolate(migratetype))) {
2622 free_one_page(zone, page, pfn, 0, migratetype); 2648 free_one_page(zone, page, pfn, 0, migratetype);
2623 goto out; 2649 return;
2624 } 2650 }
2625 migratetype = MIGRATE_MOVABLE; 2651 migratetype = MIGRATE_MOVABLE;
2626 } 2652 }
2627 2653
2628 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2654 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2629 if (!cold) 2655 list_add(&page->lru, &pcp->lists[migratetype]);
2630 list_add(&page->lru, &pcp->lists[migratetype]);
2631 else
2632 list_add_tail(&page->lru, &pcp->lists[migratetype]);
2633 pcp->count++; 2656 pcp->count++;
2634 if (pcp->count >= pcp->high) { 2657 if (pcp->count >= pcp->high) {
2635 unsigned long batch = READ_ONCE(pcp->batch); 2658 unsigned long batch = READ_ONCE(pcp->batch);
2636 free_pcppages_bulk(zone, batch, pcp); 2659 free_pcppages_bulk(zone, batch, pcp);
2637 pcp->count -= batch; 2660 pcp->count -= batch;
2638 } 2661 }
2662}
2639 2663
2640out: 2664/*
2665 * Free a 0-order page
2666 */
2667void free_unref_page(struct page *page)
2668{
2669 unsigned long flags;
2670 unsigned long pfn = page_to_pfn(page);
2671
2672 if (!free_unref_page_prepare(page, pfn))
2673 return;
2674
2675 local_irq_save(flags);
2676 free_unref_page_commit(page, pfn);
2641 local_irq_restore(flags); 2677 local_irq_restore(flags);
2642} 2678}
2643 2679
2644/* 2680/*
2645 * Free a list of 0-order pages 2681 * Free a list of 0-order pages
2646 */ 2682 */
2647void free_hot_cold_page_list(struct list_head *list, bool cold) 2683void free_unref_page_list(struct list_head *list)
2648{ 2684{
2649 struct page *page, *next; 2685 struct page *page, *next;
2686 unsigned long flags, pfn;
2687
2688 /* Prepare pages for freeing */
2689 list_for_each_entry_safe(page, next, list, lru) {
2690 pfn = page_to_pfn(page);
2691 if (!free_unref_page_prepare(page, pfn))
2692 list_del(&page->lru);
2693 set_page_private(page, pfn);
2694 }
2650 2695
2696 local_irq_save(flags);
2651 list_for_each_entry_safe(page, next, list, lru) { 2697 list_for_each_entry_safe(page, next, list, lru) {
2652 trace_mm_page_free_batched(page, cold); 2698 unsigned long pfn = page_private(page);
2653 free_hot_cold_page(page, cold); 2699
2700 set_page_private(page, 0);
2701 trace_mm_page_free_batched(page);
2702 free_unref_page_commit(page, pfn);
2654 } 2703 }
2704 local_irq_restore(flags);
2655} 2705}
2656 2706
2657/* 2707/*
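
Splitting the old free_hot_cold_page() into a prepare step and a commit step is what lets free_unref_page_list() do the per-page checks with interrupts enabled and then batch the whole list under a single local_irq_save(). A hedged, kernel-style sketch of a caller using the new API; toy_free_batch() is hypothetical (real users such as release_pages() do more bookkeeping), but the calls it makes are the ones introduced above:

    #include <linux/mm.h>
    #include <linux/list.h>

    /* Assumes order-0 pages whose last reference is dropped here. */
    static void toy_free_batch(struct page **pages, int nr)
    {
            LIST_HEAD(to_free);
            int i;

            /* per-page work happens with interrupts still enabled */
            for (i = 0; i < nr; i++) {
                    if (put_page_testzero(pages[i]))
                            list_add(&pages[i]->lru, &to_free);
            }

            /* one local_irq_save() section for the whole batch, not one per page */
            free_unref_page_list(&to_free);
    }
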
@@ -2669,15 +2719,6 @@ void split_page(struct page *page, unsigned int order)
2669 VM_BUG_ON_PAGE(PageCompound(page), page); 2719 VM_BUG_ON_PAGE(PageCompound(page), page);
2670 VM_BUG_ON_PAGE(!page_count(page), page); 2720 VM_BUG_ON_PAGE(!page_count(page), page);
2671 2721
2672#ifdef CONFIG_KMEMCHECK
2673 /*
2674 * Split shadow pages too, because free(page[0]) would
2675 * otherwise free the whole shadow.
2676 */
2677 if (kmemcheck_page_is_tracked(page))
2678 split_page(virt_to_page(page[0].shadow), order);
2679#endif
2680
2681 for (i = 1; i < (1 << order); i++) 2722 for (i = 1; i < (1 << order); i++)
2682 set_page_refcounted(page + i); 2723 set_page_refcounted(page + i);
2683 split_page_owner(page, order); 2724 split_page_owner(page, order);
@@ -2743,6 +2784,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2743#ifdef CONFIG_NUMA 2784#ifdef CONFIG_NUMA
2744 enum numa_stat_item local_stat = NUMA_LOCAL; 2785 enum numa_stat_item local_stat = NUMA_LOCAL;
2745 2786
2787 /* skip numa counters update if numa stats is disabled */
2788 if (!static_branch_likely(&vm_numa_stat_key))
2789 return;
2790
2746 if (z->node != numa_node_id()) 2791 if (z->node != numa_node_id())
2747 local_stat = NUMA_OTHER; 2792 local_stat = NUMA_OTHER;
2748 2793
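
vm_numa_stat_key, defined near the top of the file, is a static key: when NUMA stats are switched off (the sysctl wiring lives elsewhere in this series) the check above becomes a patched-out jump rather than a load and test. A hedged sketch of the pattern; demo_numa_stat_key and the demo_* functions are stand-ins, only the jump-label API is real:

    #include <linux/jump_label.h>

    /* Defaults to true (stats enabled); disabling patches the branch sites. */
    DEFINE_STATIC_KEY_TRUE(demo_numa_stat_key);

    static void demo_numa_stats_off(void)
    {
            static_branch_disable(&demo_numa_stat_key);
    }

    static void demo_count_numa_event(void)
    {
            if (!static_branch_likely(&demo_numa_stat_key))
                    return;         /* nearly free once disabled */
            /* ... bump NUMA_HIT / NUMA_LOCAL style counters here ... */
    }
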
@@ -2758,7 +2803,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2758 2803
2759/* Remove page from the per-cpu list, caller must protect the list */ 2804/* Remove page from the per-cpu list, caller must protect the list */
2760static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, 2805static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2761 bool cold, struct per_cpu_pages *pcp, 2806 struct per_cpu_pages *pcp,
2762 struct list_head *list) 2807 struct list_head *list)
2763{ 2808{
2764 struct page *page; 2809 struct page *page;
@@ -2767,16 +2812,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2767 if (list_empty(list)) { 2812 if (list_empty(list)) {
2768 pcp->count += rmqueue_bulk(zone, 0, 2813 pcp->count += rmqueue_bulk(zone, 0,
2769 pcp->batch, list, 2814 pcp->batch, list,
2770 migratetype, cold); 2815 migratetype);
2771 if (unlikely(list_empty(list))) 2816 if (unlikely(list_empty(list)))
2772 return NULL; 2817 return NULL;
2773 } 2818 }
2774 2819
2775 if (cold) 2820 page = list_first_entry(list, struct page, lru);
2776 page = list_last_entry(list, struct page, lru);
2777 else
2778 page = list_first_entry(list, struct page, lru);
2779
2780 list_del(&page->lru); 2821 list_del(&page->lru);
2781 pcp->count--; 2822 pcp->count--;
2782 } while (check_new_pcp(page)); 2823 } while (check_new_pcp(page));
@@ -2791,14 +2832,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2791{ 2832{
2792 struct per_cpu_pages *pcp; 2833 struct per_cpu_pages *pcp;
2793 struct list_head *list; 2834 struct list_head *list;
2794 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2795 struct page *page; 2835 struct page *page;
2796 unsigned long flags; 2836 unsigned long flags;
2797 2837
2798 local_irq_save(flags); 2838 local_irq_save(flags);
2799 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2839 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2800 list = &pcp->lists[migratetype]; 2840 list = &pcp->lists[migratetype];
2801 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); 2841 page = __rmqueue_pcplist(zone, migratetype, pcp, list);
2802 if (page) { 2842 if (page) {
2803 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2843 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2804 zone_statistics(preferred_zone, zone); 2844 zone_statistics(preferred_zone, zone);
@@ -3006,9 +3046,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3006 if (!area->nr_free) 3046 if (!area->nr_free)
3007 continue; 3047 continue;
3008 3048
3009 if (alloc_harder)
3010 return true;
3011
3012 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { 3049 for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3013 if (!list_empty(&area->free_list[mt])) 3050 if (!list_empty(&area->free_list[mt]))
3014 return true; 3051 return true;
@@ -3020,6 +3057,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3020 return true; 3057 return true;
3021 } 3058 }
3022#endif 3059#endif
3060 if (alloc_harder &&
3061 !list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
3062 return true;
3023 } 3063 }
3024 return false; 3064 return false;
3025} 3065}
@@ -3235,20 +3275,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3235 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 3275 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3236 return; 3276 return;
3237 3277
3238 pr_warn("%s: ", current->comm);
3239
3240 va_start(args, fmt); 3278 va_start(args, fmt);
3241 vaf.fmt = fmt; 3279 vaf.fmt = fmt;
3242 vaf.va = &args; 3280 vaf.va = &args;
3243 pr_cont("%pV", &vaf); 3281 pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
3282 current->comm, &vaf, gfp_mask, &gfp_mask,
3283 nodemask_pr_args(nodemask));
3244 va_end(args); 3284 va_end(args);
3245 3285
3246 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3247 if (nodemask)
3248 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3249 else
3250 pr_cont("(null)\n");
3251
3252 cpuset_print_current_mems_allowed(); 3286 cpuset_print_current_mems_allowed();
3253 3287
3254 dump_stack(); 3288 dump_stack();
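
Collapsing the warning into a single pr_warn() leans on printk's %pV (embed another format string plus its va_list via struct va_format) and %pGg (decode gfp flags); the removal of the explicit NULL check suggests the nodemask_pr_args()/%*pbl path now copes with a missing nodemask on its own. A minimal kernel-style sketch of the %pV idiom; demo_warn() is hypothetical:

    #include <linux/kernel.h>
    #include <linux/printk.h>
    #include <linux/sched.h>

    static void demo_warn(const char *fmt, ...)
    {
            struct va_format vaf;
            va_list args;

            va_start(args, fmt);
            vaf.fmt = fmt;
            vaf.va = &args;
            /* the caller's message becomes part of one larger log line */
            pr_warn("%s: %pV\n", current->comm, &vaf);
            va_end(args);
    }
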
@@ -3868,8 +3902,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3868 enum compact_result compact_result; 3902 enum compact_result compact_result;
3869 int compaction_retries; 3903 int compaction_retries;
3870 int no_progress_loops; 3904 int no_progress_loops;
3871 unsigned long alloc_start = jiffies;
3872 unsigned int stall_timeout = 10 * HZ;
3873 unsigned int cpuset_mems_cookie; 3905 unsigned int cpuset_mems_cookie;
3874 int reserve_flags; 3906 int reserve_flags;
3875 3907
@@ -4001,14 +4033,6 @@ retry:
4001 if (!can_direct_reclaim) 4033 if (!can_direct_reclaim)
4002 goto nopage; 4034 goto nopage;
4003 4035
4004 /* Make sure we know about allocations which stall for too long */
4005 if (time_after(jiffies, alloc_start + stall_timeout)) {
4006 warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
4007 "page allocation stalls for %ums, order:%u",
4008 jiffies_to_msecs(jiffies-alloc_start), order);
4009 stall_timeout += 10 * HZ;
4010 }
4011
4012 /* Avoid recursion of direct reclaim */ 4036 /* Avoid recursion of direct reclaim */
4013 if (current->flags & PF_MEMALLOC) 4037 if (current->flags & PF_MEMALLOC)
4014 goto nopage; 4038 goto nopage;
@@ -4223,9 +4247,6 @@ out:
4223 page = NULL; 4247 page = NULL;
4224 } 4248 }
4225 4249
4226 if (kmemcheck_enabled && page)
4227 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
4228
4229 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); 4250 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4230 4251
4231 return page; 4252 return page;
@@ -4262,7 +4283,7 @@ void __free_pages(struct page *page, unsigned int order)
4262{ 4283{
4263 if (put_page_testzero(page)) { 4284 if (put_page_testzero(page)) {
4264 if (order == 0) 4285 if (order == 0)
4265 free_hot_cold_page(page, false); 4286 free_unref_page(page);
4266 else 4287 else
4267 __free_pages_ok(page, order); 4288 __free_pages_ok(page, order);
4268 } 4289 }
@@ -4320,7 +4341,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
4320 unsigned int order = compound_order(page); 4341 unsigned int order = compound_order(page);
4321 4342
4322 if (order == 0) 4343 if (order == 0)
4323 free_hot_cold_page(page, false); 4344 free_unref_page(page);
4324 else 4345 else
4325 __free_pages_ok(page, order); 4346 __free_pages_ok(page, order);
4326 } 4347 }
@@ -6126,6 +6147,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
6126 } 6147 }
6127} 6148}
6128 6149
6150#ifdef CONFIG_FLAT_NODE_MEM_MAP
6129static void __ref alloc_node_mem_map(struct pglist_data *pgdat) 6151static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6130{ 6152{
6131 unsigned long __maybe_unused start = 0; 6153 unsigned long __maybe_unused start = 0;
@@ -6135,7 +6157,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6135 if (!pgdat->node_spanned_pages) 6157 if (!pgdat->node_spanned_pages)
6136 return; 6158 return;
6137 6159
6138#ifdef CONFIG_FLAT_NODE_MEM_MAP
6139 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 6160 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6140 offset = pgdat->node_start_pfn - start; 6161 offset = pgdat->node_start_pfn - start;
6141 /* ia64 gets its own node_mem_map, before this, without bootmem */ 6162 /* ia64 gets its own node_mem_map, before this, without bootmem */
@@ -6157,6 +6178,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6157 pgdat->node_id); 6178 pgdat->node_id);
6158 pgdat->node_mem_map = map + offset; 6179 pgdat->node_mem_map = map + offset;
6159 } 6180 }
6181 pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
6182 __func__, pgdat->node_id, (unsigned long)pgdat,
6183 (unsigned long)pgdat->node_mem_map);
6160#ifndef CONFIG_NEED_MULTIPLE_NODES 6184#ifndef CONFIG_NEED_MULTIPLE_NODES
6161 /* 6185 /*
6162 * With no DISCONTIG, the global mem_map is just set as node 0's 6186 * With no DISCONTIG, the global mem_map is just set as node 0's
@@ -6169,8 +6193,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6169#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 6193#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6170 } 6194 }
6171#endif 6195#endif
6172#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6173} 6196}
6197#else
6198static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
6199#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6174 6200
6175void __paginginit free_area_init_node(int nid, unsigned long *zones_size, 6201void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6176 unsigned long node_start_pfn, unsigned long *zholes_size) 6202 unsigned long node_start_pfn, unsigned long *zholes_size)
@@ -6197,16 +6223,49 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6197 zones_size, zholes_size); 6223 zones_size, zholes_size);
6198 6224
6199 alloc_node_mem_map(pgdat); 6225 alloc_node_mem_map(pgdat);
6200#ifdef CONFIG_FLAT_NODE_MEM_MAP
6201 printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
6202 nid, (unsigned long)pgdat,
6203 (unsigned long)pgdat->node_mem_map);
6204#endif
6205 6226
6206 reset_deferred_meminit(pgdat); 6227 reset_deferred_meminit(pgdat);
6207 free_area_init_core(pgdat); 6228 free_area_init_core(pgdat);
6208} 6229}
6209 6230
6231#ifdef CONFIG_HAVE_MEMBLOCK
6232/*
6233 * Only struct pages that are backed by physical memory are zeroed and
6234 * initialized by going through __init_single_page(). But, there are some
6235 * struct pages which are reserved in the memblock allocator and their fields
6236 * may be accessed (for example, page_to_pfn() on some configurations accesses
6237 * flags). We must explicitly zero those struct pages.
6238 */
6239void __paginginit zero_resv_unavail(void)
6240{
6241 phys_addr_t start, end;
6242 unsigned long pfn;
6243 u64 i, pgcnt;
6244
6245 /*
6246 * Loop through ranges that are reserved, but do not have reported
6247 * physical memory backing.
6248 */
6249 pgcnt = 0;
6250 for_each_resv_unavail_range(i, &start, &end) {
6251 for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
6252 mm_zero_struct_page(pfn_to_page(pfn));
6253 pgcnt++;
6254 }
6255 }
6256
6257 /*
6258 * Struct pages that do not have backing memory. This could be because
6259 * firmware is using some of this memory, or for some other reasons.
6260 * Once memblock is changed so such behaviour is not allowed: i.e.
6261 * Once memblock is changed so that such behaviour is not allowed, i.e. the
6262 * list of "reserved" memory is a subset of the list of "memory", then
6263 * this code can be removed.
6264 if (pgcnt)
6265 pr_info("Reserved but unavailable: %lld pages", pgcnt);
6266}
6267#endif /* CONFIG_HAVE_MEMBLOCK */
6268
6210#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 6269#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6211 6270
6212#if MAX_NUMNODES > 1 6271#if MAX_NUMNODES > 1
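
Note the rounding direction in zero_resv_unavail(): PFN_DOWN() on the start and PFN_UP() on the end widen the range, so every struct page that even overlaps a reserved-but-unbacked region gets zeroed, the opposite of the inward clamp used for deferred init earlier. A tiny runnable illustration, again assuming 4K pages and an invented byte range:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)
    #define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
    #define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

    int main(void)
    {
            uint64_t start = 0x9d400;    /* assumed: reserved-but-unbacked start */
            uint64_t end   = 0x9ee00;    /* assumed: reserved-but-unbacked end */

            printf("zero struct pages for pfns [%llu, %llu)\n",
                   (unsigned long long)PFN_DOWN(start),  /* start rounded down */
                   (unsigned long long)PFN_UP(end));     /* end rounded up */
            return 0;
    }
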
@@ -6630,6 +6689,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6630 node_set_state(nid, N_MEMORY); 6689 node_set_state(nid, N_MEMORY);
6631 check_for_memory(pgdat, nid); 6690 check_for_memory(pgdat, nid);
6632 } 6691 }
6692 zero_resv_unavail();
6633} 6693}
6634 6694
6635static int __init cmdline_parse_core(char *p, unsigned long *core) 6695static int __init cmdline_parse_core(char *p, unsigned long *core)
@@ -6793,6 +6853,7 @@ void __init free_area_init(unsigned long *zones_size)
6793{ 6853{
6794 free_area_init_node(0, zones_size, 6854 free_area_init_node(0, zones_size,
6795 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 6855 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
6856 zero_resv_unavail();
6796} 6857}
6797 6858
6798static int page_alloc_cpu_dead(unsigned int cpu) 6859static int page_alloc_cpu_dead(unsigned int cpu)
@@ -7305,18 +7366,17 @@ void *__init alloc_large_system_hash(const char *tablename,
7305 7366
7306 log2qty = ilog2(numentries); 7367 log2qty = ilog2(numentries);
7307 7368
7308 /*
7309 * memblock allocator returns zeroed memory already, so HASH_ZERO is
7310 * currently not used when HASH_EARLY is specified.
7311 */
7312 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; 7369 gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
7313 do { 7370 do {
7314 size = bucketsize << log2qty; 7371 size = bucketsize << log2qty;
7315 if (flags & HASH_EARLY) 7372 if (flags & HASH_EARLY) {
7316 table = memblock_virt_alloc_nopanic(size, 0); 7373 if (flags & HASH_ZERO)
7317 else if (hashdist) 7374 table = memblock_virt_alloc_nopanic(size, 0);
7375 else
7376 table = memblock_virt_alloc_raw(size, 0);
7377 } else if (hashdist) {
7318 table = __vmalloc(size, gfp_flags, PAGE_KERNEL); 7378 table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
7319 else { 7379 } else {
7320 /* 7380 /*
7321 * If bucketsize is not a power-of-two, we may free 7381 * If bucketsize is not a power-of-two, we may free
7322 * some pages at the end of hash table which 7382 * some pages at the end of hash table which
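
The point of the new branch is that normal memblock allocations come back pre-zeroed, so HASH_EARLY callers only pay for zeroing when they actually asked for it via HASH_ZERO; otherwise the raw variant can skip it. A hedged sketch of an early caller; "demo-cache" and the demo_* names are invented, but the alloc_large_system_hash() signature and the HASH_EARLY | HASH_ZERO usage follow existing boot-time hash users:

    #include <linux/bootmem.h>
    #include <linux/init.h>
    #include <linux/list.h>

    static unsigned int demo_hash_shift;
    static unsigned int demo_hash_mask;
    static struct hlist_head *demo_table;

    static void __init demo_hash_init(void)
    {
            /* HASH_ZERO steers the allocation above onto the zeroed path */
            demo_table = alloc_large_system_hash("demo-cache",
                                                 sizeof(struct hlist_head),
                                                 0,     /* size from amount of memory */
                                                 14,    /* scale */
                                                 HASH_EARLY | HASH_ZERO,
                                                 &demo_hash_shift,
                                                 &demo_hash_mask,
                                                 0, 0);
    }
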
@@ -7353,10 +7413,10 @@ void *__init alloc_large_system_hash(const char *tablename,
7353 * race condition. So you can't expect this function should be exact. 7413 * race condition. So you can't expect this function should be exact.
7354 */ 7414 */
7355bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7415bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7416 int migratetype,
7356 bool skip_hwpoisoned_pages) 7417 bool skip_hwpoisoned_pages)
7357{ 7418{
7358 unsigned long pfn, iter, found; 7419 unsigned long pfn, iter, found;
7359 int mt;
7360 7420
7361 /* 7421 /*
7362 * For avoiding noise data, lru_add_drain_all() should be called 7422 * For avoiding noise data, lru_add_drain_all() should be called
@@ -7364,8 +7424,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7364 */ 7424 */
7365 if (zone_idx(zone) == ZONE_MOVABLE) 7425 if (zone_idx(zone) == ZONE_MOVABLE)
7366 return false; 7426 return false;
7367 mt = get_pageblock_migratetype(page); 7427
7368 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 7428 /*
7429 * CMA allocations (alloc_contig_range) really need to mark isolate
7430 * CMA pageblocks even when they are not movable in fact so consider
7431 * them movable here.
7432 */
7433 if (is_migrate_cma(migratetype) &&
7434 is_migrate_cma(get_pageblock_migratetype(page)))
7369 return false; 7435 return false;
7370 7436
7371 pfn = page_to_pfn(page); 7437 pfn = page_to_pfn(page);
@@ -7377,6 +7443,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7377 7443
7378 page = pfn_to_page(check); 7444 page = pfn_to_page(check);
7379 7445
7446 if (PageReserved(page))
7447 return true;
7448
7380 /* 7449 /*
7381 * Hugepages are not in LRU lists, but they're movable. 7450 * Hugepages are not in LRU lists, but they're movable.
7382 * We need not scan over tail pages because we don't 7451 * We need not scan over tail pages because we don't
@@ -7450,7 +7519,7 @@ bool is_pageblock_removable_nolock(struct page *page)
7450 if (!zone_spans_pfn(zone, pfn)) 7519 if (!zone_spans_pfn(zone, pfn))
7451 return false; 7520 return false;
7452 7521
7453 return !has_unmovable_pages(zone, page, 0, true); 7522 return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
7454} 7523}
7455 7524
7456#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) 7525#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
@@ -7546,6 +7615,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
7546 .zone = page_zone(pfn_to_page(start)), 7615 .zone = page_zone(pfn_to_page(start)),
7547 .mode = MIGRATE_SYNC, 7616 .mode = MIGRATE_SYNC,
7548 .ignore_skip_hint = true, 7617 .ignore_skip_hint = true,
7618 .no_set_skip_hint = true,
7549 .gfp_mask = current_gfp_context(gfp_mask), 7619 .gfp_mask = current_gfp_context(gfp_mask),
7550 }; 7620 };
7551 INIT_LIST_HEAD(&cc.migratepages); 7621 INIT_LIST_HEAD(&cc.migratepages);
@@ -7582,11 +7652,18 @@ int alloc_contig_range(unsigned long start, unsigned long end,
7582 7652
7583 /* 7653 /*
7584 * In case of -EBUSY, we'd like to know which page causes problem. 7654 * In case of -EBUSY, we'd like to know which page causes problem.
7585 * So, just fall through. We will check it in test_pages_isolated(). 7655 * So, just fall through. test_pages_isolated() has a tracepoint
7656 * which will report the busy page.
7657 *
7658 * It is possible that busy pages could become available before
7659 * the call to test_pages_isolated, and the range will actually be
7660 * allocated. So, if we fall through be sure to clear ret so that
7661 * -EBUSY is not accidentally used or returned to caller.
7586 */ 7662 */
7587 ret = __alloc_contig_migrate_range(&cc, start, end); 7663 ret = __alloc_contig_migrate_range(&cc, start, end);
7588 if (ret && ret != -EBUSY) 7664 if (ret && ret != -EBUSY)
7589 goto done; 7665 goto done;
7666 ret = 0;
7590 7667
7591 /* 7668 /*
7592 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES 7669 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
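
The cleared ret matters because a transiently busy page can make __alloc_contig_migrate_range() return -EBUSY even though the isolation check later succeeds; without the reset, a successful allocation could still hand -EBUSY back to the caller. A runnable toy model of that fall-through; migrate_range() and test_isolation() are invented stand-ins for the real functions:

    #include <stdio.h>
    #include <errno.h>

    static int migrate_range(void)  { return -EBUSY; } /* transiently busy page */
    static int test_isolation(void) { return 0; }      /* 0: the busy page resolved itself */

    static int alloc_contig_sketch(int clear_ret)
    {
            int ret = migrate_range();

            if (ret && ret != -EBUSY)
                    return ret;             /* hard failure */
            if (clear_ret)
                    ret = 0;                /* the fix: don't carry -EBUSY forward */
            if (test_isolation())
                    return -EBUSY;          /* genuinely still busy */
            return ret;                     /* success path */
    }

    int main(void)
    {
            printf("without reset: %d, with reset: %d\n",
                   alloc_contig_sketch(0), alloc_contig_sketch(1));
            return 0;
    }
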