Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 404 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 281 insertions(+), 123 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e95b5b7c9c3d..cde5dac6229a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -16,6 +16,7 @@
 
 #include <linux/stddef.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/swap.h>
 #include <linux/interrupt.h>
 #include <linux/pagemap.h>
@@ -96,8 +97,12 @@ int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
 /* work_structs for global per-cpu drains */
+struct pcpu_drain {
+	struct zone *zone;
+	struct work_struct work;
+};
 DEFINE_MUTEX(pcpu_drain_mutex);
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
 
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
@@ -121,10 +126,8 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 };
 EXPORT_SYMBOL(node_states);
 
-/* Protect totalram_pages and zone->managed_pages */
-static DEFINE_SPINLOCK(managed_page_count_lock);
-
-unsigned long totalram_pages __read_mostly;
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
 unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
 
@@ -237,7 +240,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
-char * const migratetype_names[MIGRATE_TYPES] = {
+const char * const migratetype_names[MIGRATE_TYPES] = {
 	"Unmovable",
 	"Movable",
 	"Reclaimable",
@@ -263,20 +266,21 @@ compound_page_dtor * const compound_page_dtors[] = {
 
 int min_free_kbytes = 1024;
 int user_min_free_kbytes = -1;
+int watermark_boost_factor __read_mostly = 15000;
 int watermark_scale_factor = 10;
 
-static unsigned long nr_kernel_pages __meminitdata;
-static unsigned long nr_all_pages __meminitdata;
-static unsigned long dma_reserve __meminitdata;
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
 static unsigned long required_kernelcore __initdata;
 static unsigned long required_kernelcore_percent __initdata;
 static unsigned long required_movablecore __initdata;
 static unsigned long required_movablecore_percent __initdata;
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
 static bool mirrored_kernelcore __meminitdata;
 
 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -294,6 +298,32 @@ EXPORT_SYMBOL(nr_online_nodes);
 int page_group_by_mobility_disabled __read_mostly;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * Calling kasan_free_pages() only after deferred memory initialization
+ * has completed. Poisoning pages during deferred memory init will greatly
+ * lengthen the process and cause problem in large memory systems as the
+ * deferred pages initialization is done with interrupt disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+{
+	if (!static_branch_unlikely(&deferred_pages))
+		kasan_free_pages(page, order);
+}
+
 /* Returns true if the struct page for the pfn is uninitialised */
 static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 {
@@ -326,8 +356,13 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 	/* Always populate low zones for address-constrained allocations */
 	if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
 		return false;
+
+	/*
+	 * We start only with one section of pages, more pages are added as
+	 * needed until the rest of deferred pages are initialized.
+	 */
 	nr_initialised++;
-	if ((nr_initialised > NODE_DATA(nid)->static_init_pgcnt) &&
+	if ((nr_initialised > PAGES_PER_SECTION) &&
 	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
 		NODE_DATA(nid)->first_deferred_pfn = pfn;
 		return true;
@@ -335,6 +370,8 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
 	return false;
 }
 #else
+#define kasan_free_nondeferred_pages(p, o)	kasan_free_pages(p, o)
+
 static inline bool early_page_uninitialised(unsigned long pfn)
 {
 	return false;
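Note: the deferred_pages static key added above (and removed from its old location further down in this diff) is the standard jump-label pattern for a boot-time-only path: the branch stays essentially free at runtime and is patched out for good once deferred page init has finished. A minimal, self-contained sketch of the same pattern, with invented names, illustrative only and not part of this diff:

	#include <linux/jump_label.h>
	#include <linux/kasan.h>

	/* Defaults to true; flipped off once deferred page init completes. */
	static DEFINE_STATIC_KEY_TRUE(demo_deferred_init);

	static void demo_deferred_init_done(void)
	{
		/* Patches the branch so the check below becomes a plain fall-through. */
		static_branch_disable(&demo_deferred_init);
	}

	static void demo_free_hook(struct page *page, int order)
	{
		if (static_branch_unlikely(&demo_deferred_init))
			return;		/* still in deferred init: skip the expensive hook */
		kasan_free_pages(page, order);
	}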
@@ -426,6 +463,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 	unsigned long old_word, word;
 
 	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+	BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
 
 	bitmap = get_pageblock_bitmap(page, pfn);
 	bitidx = pfn_to_bitidx(page, pfn);
@@ -1037,7 +1075,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	arch_free_page(page, order);
 	kernel_poison_pages(page, 1 << order, 0);
 	kernel_map_pages(page, 1 << order, 0);
-	kasan_free_pages(page, order);
+	kasan_free_nondeferred_pages(page, order);
 
 	return true;
 }
@@ -1183,6 +1221,7 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 	init_page_count(page);
 	page_mapcount_reset(page);
 	page_cpupid_reset_last(page);
+	page_kasan_tag_reset(page);
 
 	INIT_LIST_HEAD(&page->lru);
 #ifdef WANT_PAGE_VIRTUAL
@@ -1279,7 +1318,7 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order)
 	__ClearPageReserved(p);
 	set_page_count(p, 0);
 
-	page_zone(page)->managed_pages += nr_pages;
+	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
 	set_page_refcounted(page);
 	__free_pages(page, order);
 }
@@ -1606,13 +1645,6 @@ static int __init deferred_init_memmap(void *data)
 }
 
 /*
- * During boot we initialize deferred pages on-demand, as needed, but once
- * page_alloc_init_late() has finished, the deferred pages are all initialized,
- * and we can permanently disable that path.
- */
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-
-/*
  * If this zone has deferred pages, try to grow it by initializing enough
  * deferred pages to satisfy the allocation specified by order, rounded up to
  * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
@@ -1981,8 +2013,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  */
 static int fallbacks[MIGRATE_TYPES][4] = {
 	[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
-	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
 	[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
+	[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
 #ifdef CONFIG_CMA
 	[MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
 #endif
@@ -2129,6 +2161,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 	return false;
 }
 
+static inline void boost_watermark(struct zone *zone)
+{
+	unsigned long max_boost;
+
+	if (!watermark_boost_factor)
+		return;
+
+	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+			watermark_boost_factor, 10000);
+	max_boost = max(pageblock_nr_pages, max_boost);
+
+	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+		max_boost);
+}
+
 /*
  * This function implements actual steal behaviour. If order is large enough,
  * we can steal whole pageblock. If not, we first move freepages in this
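To put concrete (illustrative, not taken from this diff) numbers on the boost arithmetic: with the default watermark_boost_factor of 15000 added above, a zone whose WMARK_HIGH is 4096 pages gets max_boost = mult_frac(4096, 15000, 10000) = 6144 pages, i.e. at most 150% of the high watermark; each fragmentation-causing fallback then raises watermark_boost by one pageblock (512 pages with 2MB pageblocks on x86-64) until that cap is reached, which temporarily raises the effective watermarks read through wmark_pages() further down in this diff.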
@@ -2138,7 +2185,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
  * itself, so pages freed in the future will be put on the correct free list.
  */
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
-					int start_type, bool whole_block)
+		unsigned int alloc_flags, int start_type, bool whole_block)
 {
 	unsigned int current_order = page_order(page);
 	struct free_area *area;
@@ -2160,6 +2207,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
 		goto single_page;
 	}
 
+	/*
+	 * Boost watermarks to increase reclaim pressure to reduce the
+	 * likelihood of future fallbacks. Wake kswapd now as the node
+	 * may be balanced overall and kswapd will not wake naturally.
+	 */
+	boost_watermark(zone);
+	if (alloc_flags & ALLOC_KSWAPD)
+		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+
 	/* We are not allowed to try stealing from the whole block */
 	if (!whole_block)
 		goto single_page;
@@ -2258,7 +2314,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 	 * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
 	 * Check is race-prone but harmless.
 	 */
-	max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+	max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
 	if (zone->nr_reserved_highatomic >= max_managed)
 		return;
 
@@ -2375,20 +2431,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
  * condition simpler.
  */
 static __always_inline bool
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+						unsigned int alloc_flags)
 {
 	struct free_area *area;
 	int current_order;
+	int min_order = order;
 	struct page *page;
 	int fallback_mt;
 	bool can_steal;
 
 	/*
+	 * Do not steal pages from freelists belonging to other pageblocks
+	 * i.e. orders < pageblock_order. If there are no local zones free,
+	 * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+	 */
+	if (alloc_flags & ALLOC_NOFRAGMENT)
+		min_order = pageblock_order;
+
+	/*
 	 * Find the largest available free page in the other list. This roughly
 	 * approximates finding the pageblock with the most free pages, which
 	 * would be too costly to do exactly.
 	 */
-	for (current_order = MAX_ORDER - 1; current_order >= order;
+	for (current_order = MAX_ORDER - 1; current_order >= min_order;
 				--current_order) {
 		area = &(zone->free_area[current_order]);
 		fallback_mt = find_suitable_fallback(area, current_order,
@@ -2433,7 +2499,8 @@ do_steal:
 	page = list_first_entry(&area->free_list[fallback_mt],
 							struct page, lru);
 
-	steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+	steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+								can_steal);
 
 	trace_mm_page_alloc_extfrag(page, order, current_order,
 		start_migratetype, fallback_mt);
@@ -2447,7 +2514,8 @@ do_steal:
  * Call me with the zone->lock already held.
  */
 static __always_inline struct page *
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
+						unsigned int alloc_flags)
 {
 	struct page *page;
 
@@ -2457,7 +2525,8 @@ retry:
 		if (migratetype == MIGRATE_MOVABLE)
 			page = __rmqueue_cma_fallback(zone, order);
 
-		if (!page && __rmqueue_fallback(zone, order, migratetype))
+		if (!page && __rmqueue_fallback(zone, order, migratetype,
+								alloc_flags))
 			goto retry;
 	}
 
@@ -2472,13 +2541,14 @@ retry:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype)
+			int migratetype, unsigned int alloc_flags)
 {
 	int i, alloced = 0;
 
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order, migratetype);
+		struct page *page = __rmqueue(zone, order, migratetype,
+								alloc_flags);
 		if (unlikely(page == NULL))
 			break;
 
@@ -2592,6 +2662,10 @@ void drain_local_pages(struct zone *zone)
 
 static void drain_local_pages_wq(struct work_struct *work)
 {
+	struct pcpu_drain *drain;
+
+	drain = container_of(work, struct pcpu_drain, work);
+
 	/*
 	 * drain_all_pages doesn't use proper cpu hotplug protection so
 	 * we can race with cpu offline when the WQ can move this from
@@ -2600,7 +2674,7 @@ static void drain_local_pages_wq(struct work_struct *work)
 	 * a different one.
 	 */
 	preempt_disable();
-	drain_local_pages(NULL);
+	drain_local_pages(drain->zone);
 	preempt_enable();
 }
 
@@ -2671,12 +2745,14 @@ void drain_all_pages(struct zone *zone)
 	}
 
 	for_each_cpu(cpu, &cpus_with_pcps) {
-		struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
-		INIT_WORK(work, drain_local_pages_wq);
-		queue_work_on(cpu, mm_percpu_wq, work);
+		struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
+
+		drain->zone = zone;
+		INIT_WORK(&drain->work, drain_local_pages_wq);
+		queue_work_on(cpu, mm_percpu_wq, &drain->work);
 	}
 	for_each_cpu(cpu, &cpus_with_pcps)
-		flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+		flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
 
 	mutex_unlock(&pcpu_drain_mutex);
 }
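Together with the drain_local_pages_wq() change above, this is the usual recipe for handing per-invocation data to a per-CPU work item: wrap the work_struct in a small per-CPU struct, fill the extra fields before queueing, and recover the wrapper with container_of() in the handler. A condensed, self-contained sketch of the pattern (names invented for illustration, not part of this diff):

	#include <linux/percpu.h>
	#include <linux/workqueue.h>

	struct demo_drain {
		struct zone *zone;		/* data the handler needs */
		struct work_struct work;
	};
	static DEFINE_PER_CPU(struct demo_drain, demo_drain);

	static void demo_drain_fn(struct work_struct *work)
	{
		struct demo_drain *drain = container_of(work, struct demo_drain, work);

		drain_local_pages(drain->zone);	/* runs on the CPU the work was queued on */
	}

	static void demo_drain_all(struct zone *zone)
	{
		int cpu;

		for_each_online_cpu(cpu) {
			struct demo_drain *drain = per_cpu_ptr(&demo_drain, cpu);

			drain->zone = zone;
			INIT_WORK(&drain->work, demo_drain_fn);
			queue_work_on(cpu, mm_percpu_wq, &drain->work);	/* same workqueue as in the hunk above */
		}
		for_each_online_cpu(cpu)
			flush_work(&per_cpu_ptr(&demo_drain, cpu)->work);
	}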
@@ -2934,6 +3010,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 
 /* Remove page from the per-cpu list, caller must protect the list */
 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+			unsigned int alloc_flags,
 			struct per_cpu_pages *pcp,
 			struct list_head *list)
 {
@@ -2943,7 +3020,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, list,
-					migratetype);
+					migratetype, alloc_flags);
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
@@ -2959,7 +3036,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 /* Lock and remove page from the per-cpu list */
 static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 			struct zone *zone, unsigned int order,
-			gfp_t gfp_flags, int migratetype)
+			gfp_t gfp_flags, int migratetype,
+			unsigned int alloc_flags)
 {
 	struct per_cpu_pages *pcp;
 	struct list_head *list;
@@ -2969,7 +3047,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	local_irq_save(flags);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
-	page = __rmqueue_pcplist(zone, migratetype, pcp, list);
+	page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
@@ -2992,7 +3070,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 
 	if (likely(order == 0)) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
-				gfp_flags, migratetype);
+				gfp_flags, migratetype, alloc_flags);
 		goto out;
 	}
 
@@ -3011,7 +3089,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 			trace_mm_page_alloc_zone_locked(page, order, migratetype);
 		}
 		if (!page)
-			page = __rmqueue(zone, order, migratetype);
+			page = __rmqueue(zone, order, migratetype, alloc_flags);
 	} while (page && check_new_pages(page, order));
 	spin_unlock(&zone->lock);
 	if (!page)
@@ -3053,7 +3131,7 @@ static int __init setup_fail_page_alloc(char *str)
 }
 __setup("fail_page_alloc=", setup_fail_page_alloc);
 
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
 	if (order < fail_page_alloc.min_order)
 		return false;
@@ -3103,13 +3181,19 @@ late_initcall(fail_page_alloc_debugfs);
 
 #else /* CONFIG_FAIL_PAGE_ALLOC */
 
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
 	return false;
 }
 
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
+static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+	return __should_fail_alloc_page(gfp_mask, order);
+}
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+
 /*
  * Return true if free base pages are above 'mark'. For high-order checks it
  * will return true of the order-0 watermark is reached and there is at least
@@ -3254,6 +3338,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 #endif	/* CONFIG_NUMA */
 
 /*
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
+ * premature use of a lower zone may cause lowmem pressure problems that
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
+ * probably too small. It only makes sense to spread allocations to avoid
+ * fragmentation between the Normal and DMA32 zones.
+ */
+static inline unsigned int
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
+{
+	unsigned int alloc_flags = 0;
+
+	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+		alloc_flags |= ALLOC_KSWAPD;
+
+#ifdef CONFIG_ZONE_DMA32
+	if (zone_idx(zone) != ZONE_NORMAL)
+		goto out;
+
+	/*
+	 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
+	 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
+	 * on UMA that if Normal is populated then so is DMA32.
+	 */
+	BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
+	if (nr_online_nodes > 1 && !populated_zone(--zone))
+		goto out;
+
+out:
+#endif /* CONFIG_ZONE_DMA32 */
+	return alloc_flags;
+}
+
+/*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
  */
@@ -3261,14 +3379,18 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 						const struct alloc_context *ac)
 {
-	struct zoneref *z = ac->preferred_zoneref;
+	struct zoneref *z;
 	struct zone *zone;
 	struct pglist_data *last_pgdat_dirty_limit = NULL;
+	bool no_fallback;
 
+retry:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
 	 */
+	no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
+	z = ac->preferred_zoneref;
 	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 								ac->nodemask) {
 		struct page *page;
@@ -3307,7 +3429,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 			}
 		}
 
-		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+		if (no_fallback && nr_online_nodes > 1 &&
+		    zone != ac->preferred_zoneref->zone) {
+			int local_nid;
+
+			/*
+			 * If moving to a remote node, retry but allow
+			 * fragmenting fallbacks. Locality is more important
+			 * than fragmentation avoidance.
+			 */
+			local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+			if (zone_to_nid(zone) != local_nid) {
+				alloc_flags &= ~ALLOC_NOFRAGMENT;
+				goto retry;
+			}
+		}
+
+		mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
 		if (!zone_watermark_fast(zone, order, mark,
 				       ac_classzone_idx(ac), alloc_flags)) {
 			int ret;
@@ -3374,6 +3512,15 @@ try_this_zone:
 		}
 	}
 
+	/*
+	 * It's possible on a UMA machine to get through all zones that are
+	 * fragmented. If avoiding fragmentation, reset and try again.
+	 */
+	if (no_fallback) {
+		alloc_flags &= ~ALLOC_NOFRAGMENT;
+		goto retry;
+	}
+
 	return NULL;
 }
 
@@ -3413,13 +3560,13 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
-	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+	pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
 			current->comm, &vaf, gfp_mask, &gfp_mask,
 			nodemask_pr_args(nodemask));
 	va_end(args);
 
 	cpuset_print_current_mems_allowed();
-
+	pr_cont("\n");
 	dump_stack();
 	warn_alloc_show_mem(gfp_mask, nodemask);
 }
@@ -3861,6 +4008,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
+	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+		alloc_flags |= ALLOC_KSWAPD;
+
 #ifdef CONFIG_CMA
 	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
@@ -4092,7 +4242,7 @@ retry_cpuset:
 	if (!ac->preferred_zoneref->zone)
 		goto nopage;
 
-	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+	if (alloc_flags & ALLOC_KSWAPD)
 		wake_all_kswapds(order, gfp_mask, ac);
 
 	/*
@@ -4150,7 +4300,7 @@ retry_cpuset:
 
 retry:
 	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
-	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+	if (alloc_flags & ALLOC_KSWAPD)
 		wake_all_kswapds(order, gfp_mask, ac);
 
 	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
@@ -4369,6 +4519,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
 
 	finalise_ac(gfp_mask, &ac);
 
+	/*
+	 * Forbid the first pass from falling back to types that fragment
+	 * memory until all local zones are considered.
+	 */
+	alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+
 	/* First allocation attempt */
 	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
 	if (likely(page))
@@ -4427,16 +4583,19 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __free_pages(struct page *page, unsigned int order)
+static inline void free_the_page(struct page *page, unsigned int order)
 {
-	if (put_page_testzero(page)) {
-		if (order == 0)
-			free_unref_page(page);
-		else
-			__free_pages_ok(page, order);
-	}
+	if (order == 0)		/* Via pcp? */
+		free_unref_page(page);
+	else
+		__free_pages_ok(page, order);
 }
 
+void __free_pages(struct page *page, unsigned int order)
+{
+	if (put_page_testzero(page))
+		free_the_page(page, order);
+}
 EXPORT_SYMBOL(__free_pages);
 
 void free_pages(unsigned long addr, unsigned int order)
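A small usage sketch of what changes in practice (illustrative only, not part of this diff): an order-0 page freed through __free_pages() now takes the per-cpu-list path rather than going straight to the buddy free path, and the same consolidation is applied to page_frag_free() and __page_frag_cache_drain() in the hunks below.

	#include <linux/gfp.h>

	static void demo_order0_free(void)
	{
		struct page *page = alloc_pages(GFP_KERNEL, 0);

		if (page)
			__free_pages(page, 0);	/* last ref: free_the_page() -> free_unref_page() */
	}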
@@ -4485,14 +4644,8 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
 	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
 
-	if (page_ref_sub_and_test(page, count)) {
-		unsigned int order = compound_order(page);
-
-		if (order == 0)
-			free_unref_page(page);
-		else
-			__free_pages_ok(page, order);
-	}
+	if (page_ref_sub_and_test(page, count))
+		free_the_page(page, compound_order(page));
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
@@ -4558,7 +4711,7 @@ void page_frag_free(void *addr)
 	struct page *page = virt_to_head_page(addr);
 
 	if (unlikely(put_page_testzero(page)))
-		__free_pages_ok(page, compound_order(page));
+		free_the_page(page, compound_order(page));
 }
 EXPORT_SYMBOL(page_frag_free);
 
@@ -4660,7 +4813,7 @@ static unsigned long nr_free_zone_pages(int offset)
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
 
 	for_each_zone_zonelist(zone, z, zonelist, offset) {
-		unsigned long size = zone->managed_pages;
+		unsigned long size = zone_managed_pages(zone);
 		unsigned long high = high_wmark_pages(zone);
 		if (size > high)
 			sum += size - high;
@@ -4712,7 +4865,7 @@ long si_mem_available(void)
 		pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
 
 	for_each_zone(zone)
-		wmark_low += zone->watermark[WMARK_LOW];
+		wmark_low += low_wmark_pages(zone);
 
 	/*
 	 * Estimate the amount of memory available for userspace allocations,
@@ -4746,11 +4899,11 @@ EXPORT_SYMBOL_GPL(si_mem_available);
 
 void si_meminfo(struct sysinfo *val)
 {
-	val->totalram = totalram_pages;
+	val->totalram = totalram_pages();
 	val->sharedram = global_node_page_state(NR_SHMEM);
 	val->freeram = global_zone_page_state(NR_FREE_PAGES);
 	val->bufferram = nr_blockdev_pages();
-	val->totalhigh = totalhigh_pages;
+	val->totalhigh = totalhigh_pages();
 	val->freehigh = nr_free_highpages();
 	val->mem_unit = PAGE_SIZE;
 }
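totalram_pages(), totalhigh_pages() and the *_add()/*_inc() helpers used in these hunks come from the header changes that accompany this patch rather than from page_alloc.c itself; approximately:

	/* include/linux/mm.h (approximate sketch of the accompanying header change) */
	extern atomic_long_t _totalram_pages;

	static inline unsigned long totalram_pages(void)
	{
		return (unsigned long)atomic_long_read(&_totalram_pages);
	}

	static inline void totalram_pages_add(long count)
	{
		atomic_long_add(count, &_totalram_pages);
	}

	static inline void totalram_pages_inc(void)
	{
		atomic_long_inc(&_totalram_pages);
	}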
@@ -4767,7 +4920,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 	pg_data_t *pgdat = NODE_DATA(nid);
 
 	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
-		managed_pages += pgdat->node_zones[zone_type].managed_pages;
+		managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
 	val->totalram = managed_pages;
 	val->sharedram = node_page_state(pgdat, NR_SHMEM);
 	val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
@@ -4776,7 +4929,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 		struct zone *zone = &pgdat->node_zones[zone_type];
 
 		if (is_highmem(zone)) {
-			managed_highpages += zone->managed_pages;
+			managed_highpages += zone_managed_pages(zone);
 			free_highpages += zone_page_state(zone, NR_FREE_PAGES);
 		}
 	}
@@ -4983,7 +5136,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
 			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
 			K(zone->present_pages),
-			K(zone->managed_pages),
+			K(zone_managed_pages(zone)),
 			K(zone_page_state(zone, NR_MLOCK)),
 			zone_page_state(zone, NR_KERNEL_STACK_KB),
 			K(zone_page_state(zone, NR_PAGETABLE)),
@@ -5655,7 +5808,7 @@ static int zone_batchsize(struct zone *zone)
 	 * The per-cpu-pages pools are set to around 1000th of the
 	 * size of the zone.
 	 */
-	batch = zone->managed_pages / 1024;
+	batch = zone_managed_pages(zone) / 1024;
 	/* But no more than a meg. */
 	if (batch * PAGE_SIZE > 1024 * 1024)
 		batch = (1024 * 1024) / PAGE_SIZE;
@@ -5736,7 +5889,6 @@ static void pageset_init(struct per_cpu_pageset *p)
 	memset(p, 0, sizeof(*p));
 
 	pcp = &p->pcp;
-	pcp->count = 0;
 	for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
 		INIT_LIST_HEAD(&pcp->lists[migratetype]);
 }
@@ -5766,7 +5918,7 @@ static void pageset_set_high_and_batch(struct zone *zone,
 {
 	if (percpu_pagelist_fraction)
 		pageset_set_high(pcp,
-			(zone->managed_pages /
+			(zone_managed_pages(zone) /
 				percpu_pagelist_fraction));
 	else
 		pageset_set_batch(pcp, zone_batchsize(zone));
@@ -5920,7 +6072,7 @@ void __init sparse_memory_present_with_active_regions(int nid)
 * with no available memory, a warning is printed and the start and end
 * PFNs will be 0.
 */
-void __meminit get_pfn_range_for_nid(unsigned int nid,
+void __init get_pfn_range_for_nid(unsigned int nid,
 			unsigned long *start_pfn, unsigned long *end_pfn)
 {
 	unsigned long this_start_pfn, this_end_pfn;
@@ -5969,7 +6121,7 @@ static void __init find_usable_zone_for_movable(void)
 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
 * zones within a node are in order of monotonic increases memory addresses
 */
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
+static void __init adjust_zone_range_for_zone_movable(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
@@ -6000,7 +6152,7 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
 * Return the number of pages a zone spans in a node, including holes
 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
 */
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __init zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
@@ -6035,7 +6187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
 * then all holes in the requested range will be accounted for.
 */
-unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __init __absent_pages_in_range(int nid,
 				unsigned long range_start_pfn,
 				unsigned long range_end_pfn)
 {
@@ -6065,7 +6217,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
 }
 
 /* Return the number of page frames in holes in a zone on a node */
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __init zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
@@ -6117,7 +6269,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 }
 
 #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static inline unsigned long __init zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long node_start_pfn,
 					unsigned long node_end_pfn,
@@ -6136,7 +6288,7 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 	return zones_size[zone_type];
 }
 
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
+static inline unsigned long __init zone_absent_pages_in_node(int nid,
 						unsigned long zone_type,
 						unsigned long node_start_pfn,
 						unsigned long node_end_pfn,
@@ -6150,7 +6302,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
 						unsigned long node_start_pfn,
 						unsigned long node_end_pfn,
 						unsigned long *zones_size,
@@ -6323,7 +6475,7 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
 static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
 							unsigned long remaining_pages)
 {
-	zone->managed_pages = remaining_pages;
+	atomic_long_set(&zone->managed_pages, remaining_pages);
 	zone_set_nid(zone, nid);
 	zone->name = zone_names[idx];
 	zone->zone_pgdat = NODE_DATA(nid);
@@ -6476,12 +6628,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { }
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
 {
-	/*
-	 * We start only with one section of pages, more pages are added as
-	 * needed until the rest of deferred pages are initialized.
-	 */
-	pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
-						pgdat->node_spanned_pages);
 	pgdat->first_deferred_pfn = ULONG_MAX;
 }
 #else
@@ -7075,18 +7221,16 @@ early_param("movablecore", cmdline_parse_movablecore);
 
 void adjust_managed_page_count(struct page *page, long count)
 {
-	spin_lock(&managed_page_count_lock);
-	page_zone(page)->managed_pages += count;
-	totalram_pages += count;
+	atomic_long_add(count, &page_zone(page)->managed_pages);
+	totalram_pages_add(count);
 #ifdef CONFIG_HIGHMEM
 	if (PageHighMem(page))
-		totalhigh_pages += count;
+		totalhigh_pages_add(count);
 #endif
-	spin_unlock(&managed_page_count_lock);
 }
 EXPORT_SYMBOL(adjust_managed_page_count);
 
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
 {
 	void *pos;
 	unsigned long pages = 0;
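zone_managed_pages(), used throughout this conversion, is just a read-side wrapper over the new atomic counter; approximately:

	/* include/linux/mmzone.h (approximate sketch of the accompanying header change) */
	static inline unsigned long zone_managed_pages(struct zone *zone)
	{
		return (unsigned long)atomic_long_read(&zone->managed_pages);
	}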
@@ -7123,9 +7267,9 @@ EXPORT_SYMBOL(free_reserved_area);
 void free_highmem_page(struct page *page)
 {
 	__free_reserved_page(page);
-	totalram_pages++;
-	page_zone(page)->managed_pages++;
-	totalhigh_pages++;
+	totalram_pages_inc();
+	atomic_long_inc(&page_zone(page)->managed_pages);
+	totalhigh_pages_inc();
 }
 #endif
 
@@ -7174,10 +7318,10 @@ void __init mem_init_print_info(const char *str)
 		physpages << (PAGE_SHIFT - 10),
 		codesize >> 10, datasize >> 10, rosize >> 10,
 		(init_data_size + init_code_size) >> 10, bss_size >> 10,
-		(physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+		(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
 		totalcma_pages << (PAGE_SHIFT - 10),
 #ifdef CONFIG_HIGHMEM
-		totalhigh_pages << (PAGE_SHIFT - 10),
+		totalhigh_pages() << (PAGE_SHIFT - 10),
 #endif
 		str ? ", " : "", str ? str : "");
 }
@@ -7257,6 +7401,7 @@ static void calculate_totalreserve_pages(void)
 		for (i = 0; i < MAX_NR_ZONES; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			long max = 0;
+			unsigned long managed_pages = zone_managed_pages(zone);
 
 			/* Find valid and maximum lowmem_reserve in the zone */
 			for (j = i; j < MAX_NR_ZONES; j++) {
@@ -7267,8 +7412,8 @@ static void calculate_totalreserve_pages(void)
 			/* we treat the high watermark as reserved pages. */
 			max += high_wmark_pages(zone);
 
-			if (max > zone->managed_pages)
-				max = zone->managed_pages;
+			if (max > managed_pages)
+				max = managed_pages;
 
 			pgdat->totalreserve_pages += max;
 
@@ -7292,7 +7437,7 @@ static void setup_per_zone_lowmem_reserve(void)
 	for_each_online_pgdat(pgdat) {
 		for (j = 0; j < MAX_NR_ZONES; j++) {
 			struct zone *zone = pgdat->node_zones + j;
-			unsigned long managed_pages = zone->managed_pages;
+			unsigned long managed_pages = zone_managed_pages(zone);
 
 			zone->lowmem_reserve[j] = 0;
 
@@ -7310,7 +7455,7 @@ static void setup_per_zone_lowmem_reserve(void)
 				lower_zone->lowmem_reserve[j] =
 					managed_pages / sysctl_lowmem_reserve_ratio[idx];
 			}
-			managed_pages += lower_zone->managed_pages;
+			managed_pages += zone_managed_pages(lower_zone);
 		}
 	}
 }
@@ -7329,14 +7474,14 @@ static void __setup_per_zone_wmarks(void)
 	/* Calculate total number of !ZONE_HIGHMEM pages */
 	for_each_zone(zone) {
 		if (!is_highmem(zone))
-			lowmem_pages += zone->managed_pages;
+			lowmem_pages += zone_managed_pages(zone);
 	}
 
 	for_each_zone(zone) {
 		u64 tmp;
 
 		spin_lock_irqsave(&zone->lock, flags);
-		tmp = (u64)pages_min * zone->managed_pages;
+		tmp = (u64)pages_min * zone_managed_pages(zone);
 		do_div(tmp, lowmem_pages);
 		if (is_highmem(zone)) {
 			/*
@@ -7350,15 +7495,15 @@ static void __setup_per_zone_wmarks(void)
 			 */
 			unsigned long min_pages;
 
-			min_pages = zone->managed_pages / 1024;
+			min_pages = zone_managed_pages(zone) / 1024;
 			min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
-			zone->watermark[WMARK_MIN] = min_pages;
+			zone->_watermark[WMARK_MIN] = min_pages;
 		} else {
 			/*
 			 * If it's a lowmem zone, reserve a number of pages
 			 * proportionate to the zone's size.
 			 */
-			zone->watermark[WMARK_MIN] = tmp;
+			zone->_watermark[WMARK_MIN] = tmp;
 		}
 
 		/*
@@ -7367,11 +7512,12 @@ static void __setup_per_zone_wmarks(void)
 		 * ensure a minimum size on small systems.
 		 */
 		tmp = max_t(u64, tmp >> 2,
-			    mult_frac(zone->managed_pages,
+			    mult_frac(zone_managed_pages(zone),
 				      watermark_scale_factor, 10000));
 
-		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
-		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+		zone->_watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
+		zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+		zone->watermark_boost = 0;
 
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
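The watermark array is renamed to _watermark[] and paired with the new watermark_boost field, so readers are expected to go through accessors that add the boost; approximately:

	/* include/linux/mmzone.h (approximate sketch of the accompanying header change) */
	#define min_wmark_pages(z)	(z->_watermark[WMARK_MIN] + z->watermark_boost)
	#define low_wmark_pages(z)	(z->_watermark[WMARK_LOW] + z->watermark_boost)
	#define high_wmark_pages(z)	(z->_watermark[WMARK_HIGH] + z->watermark_boost)
	#define wmark_pages(z, i)	(z->_watermark[i] + z->watermark_boost)

This is why the allocator fast path above switched from zone->watermark[...] to wmark_pages(zone, ...): a boosted zone transiently behaves as if its watermarks were higher.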
@@ -7472,6 +7618,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
 	return 0;
 }
 
+int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	return 0;
+}
+
 int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
@@ -7497,8 +7655,8 @@ static void setup_min_unmapped_ratio(void)
 		pgdat->min_unmapped_pages = 0;
 
 	for_each_zone(zone)
-		zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
-				sysctl_min_unmapped_ratio) / 100;
+		zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
+				sysctl_min_unmapped_ratio) / 100;
 }
 
 
@@ -7525,8 +7683,8 @@ static void setup_min_slab_ratio(void)
 		pgdat->min_slab_pages = 0;
 
 	for_each_zone(zone)
-		zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
-				sysctl_min_slab_ratio) / 100;
+		zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
+				sysctl_min_slab_ratio) / 100;
 }
 
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
@@ -7766,8 +7924,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 * race condition. So you can't expect this function should be exact.
 */
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
-			 int migratetype,
-			 bool skip_hwpoisoned_pages)
+			 int migratetype, int flags)
 {
 	unsigned long pfn, iter, found;
 
@@ -7841,7 +7998,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		 * The HWPoisoned page may be not in buddy system, and
 		 * page_count() is not 0.
 		 */
-		if (skip_hwpoisoned_pages && PageHWPoison(page))
+		if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
 			continue;
 
 		if (__PageMovable(page))
@@ -7868,6 +8025,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 	return false;
 unmovable:
 	WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+	if (flags & REPORT_FAILURE)
+		dump_page(pfn_to_page(pfn+iter), "unmovable page");
 	return true;
 }
 
@@ -7994,8 +8153,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	 */
 
 	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype,
-				       false);
+				       pfn_max_align_up(end), migratetype, 0);
 	if (ret)
 		return ret;
 