Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	317
1 file changed, 183 insertions(+), 134 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f3d603cef2c0..2302f250d6b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -65,6 +65,7 @@
 #include <linux/page_owner.h>
 #include <linux/kthread.h>
 #include <linux/memcontrol.h>
+#include <linux/ftrace.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -291,6 +292,26 @@ int page_group_by_mobility_disabled __read_mostly;
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 static inline void reset_deferred_meminit(pg_data_t *pgdat)
 {
+	unsigned long max_initialise;
+	unsigned long reserved_lowmem;
+
+	/*
+	 * Initialise at least 2G of a node but also take into account that
+	 * two large system hashes that can take up 1GB for 0.25TB/node.
+	 */
+	max_initialise = max(2UL << (30 - PAGE_SHIFT),
+		(pgdat->node_spanned_pages >> 8));
+
+	/*
+	 * Compensate the all the memblock reservations (e.g. crash kernel)
+	 * from the initial estimation to make sure we will initialize enough
+	 * memory to boot.
+	 */
+	reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
+			pgdat->node_start_pfn + max_initialise);
+	max_initialise += reserved_lowmem;
+
+	pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
 	pgdat->first_deferred_pfn = ULONG_MAX;
 }
 
@@ -313,20 +334,11 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 				unsigned long pfn, unsigned long zone_end,
 				unsigned long *nr_initialised)
 {
-	unsigned long max_initialise;
-
 	/* Always populate low zones for address-contrained allocations */
 	if (zone_end < pgdat_end_pfn(pgdat))
 		return true;
-	/*
-	 * Initialise at least 2G of a node but also take into account that
-	 * two large system hashes that can take up 1GB for 0.25TB/node.
-	 */
-	max_initialise = max(2UL << (30 - PAGE_SHIFT),
-		(pgdat->node_spanned_pages >> 8));
-
 	(*nr_initialised)++;
-	if ((*nr_initialised > max_initialise) &&
+	if ((*nr_initialised > pgdat->static_init_size) &&
 	    (pfn & (PAGES_PER_SECTION - 1)) == 0) {
 		pgdat->first_deferred_pfn = pfn;
 		return false;
@@ -1090,14 +1102,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
 	int migratetype = 0;
 	int batch_free = 0;
-	unsigned long nr_scanned, flags;
 	bool isolated_pageblocks;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
-	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-	if (nr_scanned)
-		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
 	while (count) {
 		struct page *page;
@@ -1142,7 +1150,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			trace_mm_page_pcpu_drain(page, 0, mt);
 		} while (--count && --batch_free && !list_empty(list));
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 }
 
 static void free_one_page(struct zone *zone,
@@ -1150,19 +1158,13 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
-	unsigned long nr_scanned, flags;
-	spin_lock_irqsave(&zone->lock, flags);
-	__count_vm_events(PGFREE, 1 << order);
-	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-	if (nr_scanned)
-		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
+	spin_lock(&zone->lock);
 	if (unlikely(has_isolate_pageblock(zone) ||
 		     is_migrate_isolate(migratetype))) {
 		migratetype = get_pfnblock_migratetype(page, pfn);
 	}
 	__free_one_page(page, pfn, zone, order, migratetype);
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1240,6 +1242,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
+	unsigned long flags;
 	int migratetype;
 	unsigned long pfn = page_to_pfn(page);
 
@@ -1247,7 +1250,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
+	local_irq_save(flags);
+	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype);
+	local_irq_restore(flags);
 }
 
 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -1695,10 +1701,10 @@ static inline int check_new_page(struct page *page)
 	return 1;
 }
 
-static inline bool free_pages_prezeroed(bool poisoned)
+static inline bool free_pages_prezeroed(void)
 {
 	return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
-		page_poisoning_enabled() && poisoned;
+		page_poisoning_enabled();
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -1752,17 +1758,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
 							unsigned int alloc_flags)
 {
 	int i;
-	bool poisoned = true;
-
-	for (i = 0; i < (1 << order); i++) {
-		struct page *p = page + i;
-		if (poisoned)
-			poisoned &= page_is_poisoned(p);
-	}
 
 	post_alloc_hook(page, order, gfp_flags);
 
-	if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
+	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
 		for (i = 0; i < (1 << order); i++)
 			clear_highpage(page + i);
 
@@ -1844,9 +1843,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
  * Note that start_page and end_pages are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
  */
-int move_freepages(struct zone *zone,
+static int move_freepages(struct zone *zone,
 			  struct page *start_page, struct page *end_page,
-			  int migratetype)
+			  int migratetype, int *num_movable)
 {
 	struct page *page;
 	unsigned int order;
@@ -1863,6 +1862,9 @@ int move_freepages(struct zone *zone,
 	VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
 #endif
 
+	if (num_movable)
+		*num_movable = 0;
+
 	for (page = start_page; page <= end_page;) {
 		if (!pfn_valid_within(page_to_pfn(page))) {
 			page++;
@@ -1873,6 +1875,15 @@ int move_freepages(struct zone *zone,
 		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 
 		if (!PageBuddy(page)) {
+			/*
+			 * We assume that pages that could be isolated for
+			 * migration are movable. But we don't actually try
+			 * isolating, as that would be expensive.
+			 */
+			if (num_movable &&
+					(PageLRU(page) || __PageMovable(page)))
+				(*num_movable)++;
+
 			page++;
 			continue;
 		}
@@ -1888,7 +1899,7 @@ int move_freepages(struct zone *zone,
 }
 
 int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype)
+				int migratetype, int *num_movable)
 {
 	unsigned long start_pfn, end_pfn;
 	struct page *start_page, *end_page;
@@ -1905,7 +1916,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
 	if (!zone_spans_pfn(zone, end_pfn))
 		return 0;
 
-	return move_freepages(zone, start_page, end_page, migratetype);
+	return move_freepages(zone, start_page, end_page, migratetype,
+								num_movable);
 }
 
 static void change_pageblock_range(struct page *pageblock_page,
@@ -1955,28 +1967,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 /*
  * This function implements actual steal behaviour. If order is large enough,
  * we can steal whole pageblock. If not, we first move freepages in this
- * pageblock and check whether half of pages are moved or not. If half of
- * pages are moved, we can change migratetype of pageblock and permanently
- * use it's pages as requested migratetype in the future.
+ * pageblock to our migratetype and determine how many already-allocated pages
+ * are there in the pageblock with a compatible migratetype. If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
  */
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
-							  int start_type)
+					int start_type, bool whole_block)
 {
 	unsigned int current_order = page_order(page);
-	int pages;
+	struct free_area *area;
+	int free_pages, movable_pages, alike_pages;
+	int old_block_type;
+
+	old_block_type = get_pageblock_migratetype(page);
+
+	/*
+	 * This can happen due to races and we want to prevent broken
+	 * highatomic accounting.
+	 */
+	if (is_migrate_highatomic(old_block_type))
+		goto single_page;
 
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
 		change_pageblock_range(page, current_order, start_type);
-		return;
+		goto single_page;
+	}
+
+	/* We are not allowed to try stealing from the whole block */
+	if (!whole_block)
+		goto single_page;
+
+	free_pages = move_freepages_block(zone, page, start_type,
+						&movable_pages);
+	/*
+	 * Determine how many pages are compatible with our allocation.
+	 * For movable allocation, it's the number of movable pages which
+	 * we just obtained. For other types it's a bit more tricky.
+	 */
+	if (start_type == MIGRATE_MOVABLE) {
+		alike_pages = movable_pages;
+	} else {
+		/*
+		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+		 * to MOVABLE pageblock, consider all non-movable pages as
+		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+		 * vice versa, be conservative since we can't distinguish the
+		 * exact migratetype of non-movable pages.
+		 */
+		if (old_block_type == MIGRATE_MOVABLE)
+			alike_pages = pageblock_nr_pages
+						- (free_pages + movable_pages);
+		else
+			alike_pages = 0;
 	}
 
-	pages = move_freepages_block(zone, page, start_type);
+	/* moving whole block can fail due to zone boundary conditions */
+	if (!free_pages)
+		goto single_page;
 
-	/* Claim the whole block if over half of it is free */
-	if (pages >= (1 << (pageblock_order-1)) ||
+	/*
+	 * If a sufficient number of pages in the block are either free or of
+	 * comparable migratability as our allocation, claim the whole block.
+	 */
+	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
 			page_group_by_mobility_disabled)
 		set_pageblock_migratetype(page, start_type);
+
+	return;
+
+single_page:
+	area = &zone->free_area[current_order];
+	list_move(&page->lru, &area->free_list[start_type]);
 }
 
 /*
@@ -2042,11 +2105,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 
 	/* Yoink! */
 	mt = get_pageblock_migratetype(page);
-	if (mt != MIGRATE_HIGHATOMIC &&
-			!is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+	if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+	    && !is_migrate_cma(mt)) {
 		zone->nr_reserved_highatomic += pageblock_nr_pages;
 		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
-		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
 	}
 
 out_unlock:
@@ -2100,8 +2163,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * from highatomic to ac->migratetype. So we should
 			 * adjust the count once.
 			 */
-			if (get_pageblock_migratetype(page) ==
-							MIGRATE_HIGHATOMIC) {
+			if (is_migrate_highatomic_page(page)) {
 				/*
 				 * It should never happen but changes to
 				 * locking could inadvertently allow a per-cpu
@@ -2124,7 +2186,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * may increase.
 			 */
 			set_pageblock_migratetype(page, ac->migratetype);
-			ret = move_freepages_block(zone, page, ac->migratetype);
+			ret = move_freepages_block(zone, page, ac->migratetype,
+									NULL);
 			if (ret) {
 				spin_unlock_irqrestore(&zone->lock, flags);
 				return ret;
@@ -2136,8 +2199,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 	return false;
 }
 
-/* Remove an element from the buddy allocator from the fallback list */
-static inline struct page *
+/*
+ * Try finding a free buddy page on the fallback list and put it on the free
+ * list of requested migratetype, possibly along with other pages from the same
+ * block, depending on fragmentation avoidance heuristics. Returns true if
+ * fallback was found so that __rmqueue_smallest() can grab it.
+ */
+static inline bool
 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 {
 	struct free_area *area;
@@ -2158,33 +2226,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 
 		page = list_first_entry(&area->free_list[fallback_mt],
 						struct page, lru);
-		if (can_steal &&
-			get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
-			steal_suitable_fallback(zone, page, start_migratetype);
 
-		/* Remove the page from the freelists */
-		area->nr_free--;
-		list_del(&page->lru);
-		rmv_page_order(page);
-
-		expand(zone, page, order, current_order, area,
-					start_migratetype);
-		/*
-		 * The pcppage_migratetype may differ from pageblock's
-		 * migratetype depending on the decisions in
-		 * find_suitable_fallback(). This is OK as long as it does not
-		 * differ for MIGRATE_CMA pageblocks. Those can be used as
-		 * fallback only via special __rmqueue_cma_fallback() function
-		 */
-		set_pcppage_migratetype(page, start_migratetype);
+		steal_suitable_fallback(zone, page, start_migratetype,
+								can_steal);
 
 		trace_mm_page_alloc_extfrag(page, order, current_order,
 			start_migratetype, fallback_mt);
 
-		return page;
+		return true;
 	}
 
-	return NULL;
+	return false;
 }
 
 /*
2190/* 2242/*
@@ -2196,13 +2248,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
2196{ 2248{
2197 struct page *page; 2249 struct page *page;
2198 2250
2251retry:
2199 page = __rmqueue_smallest(zone, order, migratetype); 2252 page = __rmqueue_smallest(zone, order, migratetype);
2200 if (unlikely(!page)) { 2253 if (unlikely(!page)) {
2201 if (migratetype == MIGRATE_MOVABLE) 2254 if (migratetype == MIGRATE_MOVABLE)
2202 page = __rmqueue_cma_fallback(zone, order); 2255 page = __rmqueue_cma_fallback(zone, order);
2203 2256
2204 if (!page) 2257 if (!page && __rmqueue_fallback(zone, order, migratetype))
2205 page = __rmqueue_fallback(zone, order, migratetype); 2258 goto retry;
2206 } 2259 }
2207 2260
2208 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2261 trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -2219,9 +2272,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2219 int migratetype, bool cold) 2272 int migratetype, bool cold)
2220{ 2273{
2221 int i, alloced = 0; 2274 int i, alloced = 0;
2222 unsigned long flags;
2223 2275
2224 spin_lock_irqsave(&zone->lock, flags); 2276 spin_lock(&zone->lock);
2225 for (i = 0; i < count; ++i) { 2277 for (i = 0; i < count; ++i) {
2226 struct page *page = __rmqueue(zone, order, migratetype); 2278 struct page *page = __rmqueue(zone, order, migratetype);
2227 if (unlikely(page == NULL)) 2279 if (unlikely(page == NULL))
@@ -2257,7 +2309,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2257 * pages added to the pcp list. 2309 * pages added to the pcp list.
2258 */ 2310 */
2259 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2311 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2260 spin_unlock_irqrestore(&zone->lock, flags); 2312 spin_unlock(&zone->lock);
2261 return alloced; 2313 return alloced;
2262} 2314}
2263 2315
@@ -2485,25 +2537,22 @@ void free_hot_cold_page(struct page *page, bool cold)
 {
 	struct zone *zone = page_zone(page);
 	struct per_cpu_pages *pcp;
+	unsigned long flags;
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
-	if (in_interrupt()) {
-		__free_pages_ok(page, 0);
-		return;
-	}
-
 	if (!free_pcp_prepare(page))
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	set_pcppage_migratetype(page, migratetype);
-	preempt_disable();
+	local_irq_save(flags);
+	__count_vm_event(PGFREE);
 
 	/*
 	 * We only track unmovable, reclaimable and movable on pcp lists.
 	 * Free ISOLATE pages back to the allocator because they are being
-	 * offlined but treat RESERVE as movable pages so we can get those
+	 * offlined but treat HIGHATOMIC as movable pages so we can get those
	 * areas back if necessary. Otherwise, we may have to free
 	 * excessively into the page allocator
 	 */
@@ -2515,7 +2564,6 @@ void free_hot_cold_page(struct page *page, bool cold)
 		migratetype = MIGRATE_MOVABLE;
 	}
 
-	__count_vm_event(PGFREE);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	if (!cold)
 		list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2529,7 +2577,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 	}
 
 out:
-	preempt_enable();
+	local_irq_restore(flags);
 }
 
 /*
@@ -2614,7 +2662,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
 		for (; page < endpage; page += pageblock_nr_pages) {
 			int mt = get_pageblock_migratetype(page);
 			if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
-				&& mt != MIGRATE_HIGHATOMIC)
+			    && !is_migrate_highatomic(mt))
 				set_pageblock_migratetype(page,
 							  MIGRATE_MOVABLE);
 		}
@@ -2654,8 +2702,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 {
 	struct page *page;
 
-	VM_BUG_ON(in_interrupt());
-
 	do {
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
@@ -2686,8 +2732,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 	struct list_head *list;
 	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
+	unsigned long flags;
 
-	preempt_disable();
+	local_irq_save(flags);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
 	page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
@@ -2695,7 +2742,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
 	}
-	preempt_enable();
+	local_irq_restore(flags);
 	return page;
 }
 
@@ -2711,7 +2758,7 @@ struct page *rmqueue(struct zone *preferred_zone,
 	unsigned long flags;
 	struct page *page;
 
-	if (likely(order == 0) && !in_interrupt()) {
+	if (likely(order == 0)) {
 		page = rmqueue_pcplist(preferred_zone, zone, order,
 				gfp_flags, migratetype);
 		goto out;
@@ -3113,8 +3160,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
 	static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
-	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
-	    debug_guardpage_minorder() > 0)
+	if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
 		return;
 
 	pr_warn("%s: ", current->comm);
@@ -3248,14 +3294,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		enum compact_priority prio, enum compact_result *compact_result)
 {
 	struct page *page;
+	unsigned int noreclaim_flag;
 
 	if (!order)
 		return NULL;
 
-	current->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
 									prio);
-	current->flags &= ~PF_MEMALLOC;
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	if (*compact_result <= COMPACT_INACTIVE)
 		return NULL;
@@ -3402,12 +3449,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 {
 	struct reclaim_state reclaim_state;
 	int progress;
+	unsigned int noreclaim_flag;
 
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	current->flags |= PF_MEMALLOC;
+	noreclaim_flag = memalloc_noreclaim_save();
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
 	current->reclaim_state = &reclaim_state;
@@ -3417,7 +3465,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 
 	current->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
-	current->flags &= ~PF_MEMALLOC;
+	memalloc_noreclaim_restore(noreclaim_flag);
 
 	cond_resched();
 
@@ -3525,19 +3573,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 }
 
 /*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
- * The reclaim feedback represented by did_some_progress (any progress during
- * the last reclaim round) and no_progress_loops (number of reclaim rounds without
- * any progress in a row) is considered as well as the reclaimable pages on the
- * applicable zone list (with a backoff mechanism which is a function of
- * no_progress_loops).
+ *
+ * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
+ * without success, or when we couldn't even meet the watermark if we
+ * reclaimed all remaining pages on the LRU lists.
  *
  * Returns true if a retry is viable or false to enter the oom path.
  */
@@ -3582,13 +3623,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		bool wmark;
 
 		available = reclaimable = zone_reclaimable_pages(zone);
-		available -= DIV_ROUND_UP((*no_progress_loops) * available,
-					  MAX_RECLAIM_RETRIES);
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
 
 		/*
-		 * Would the allocation succeed if we reclaimed the whole
-		 * available?
+		 * Would the allocation succeed if we reclaimed all
+		 * reclaimable pages?
 		 */
 		wmark = __zone_watermark_ok(zone, order, min_wmark,
 				ac_classzone_idx(ac), alloc_flags, available);
@@ -3639,6 +3678,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
 {
 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
+	const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
 	struct page *page = NULL;
 	unsigned int alloc_flags;
 	unsigned long did_some_progress;
@@ -3706,12 +3746,17 @@ retry_cpuset:
 
 	/*
 	 * For costly allocations, try direct compaction first, as it's likely
-	 * that we have enough base pages and don't need to reclaim. Don't try
-	 * that for allocations that are allowed to ignore watermarks, as the
-	 * ALLOC_NO_WATERMARKS attempt didn't yet happen.
+	 * that we have enough base pages and don't need to reclaim. For non-
+	 * movable high-order allocations, do that as well, as compaction will
+	 * try prevent permanent fragmentation by migrating from blocks of the
+	 * same migratetype.
+	 * Don't try this for allocations that are allowed to ignore
+	 * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
 	 */
-	if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER &&
-		!gfp_pfmemalloc_allowed(gfp_mask)) {
+	if (can_direct_reclaim &&
+			(costly_order ||
+			   (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
+			&& !gfp_pfmemalloc_allowed(gfp_mask)) {
 		page = __alloc_pages_direct_compact(gfp_mask, order,
 						alloc_flags, ac,
 						INIT_COMPACT_PRIORITY,
@@ -3723,7 +3768,7 @@ retry_cpuset:
 		 * Checks for costly allocations with __GFP_NORETRY, which
 		 * includes THP page fault allocations
 		 */
-		if (gfp_mask & __GFP_NORETRY) {
+		if (costly_order && (gfp_mask & __GFP_NORETRY)) {
 			/*
 			 * If compaction is deferred for high-order allocations,
 			 * it is because sync compaction recently failed. If
@@ -3774,7 +3819,7 @@ retry:
 
 	/* Make sure we know about allocations which stall for too long */
 	if (time_after(jiffies, alloc_start + stall_timeout)) {
-		warn_alloc(gfp_mask, ac->nodemask,
+		warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
 			"page allocation stalls for %ums, order:%u",
 			jiffies_to_msecs(jiffies-alloc_start), order);
 		stall_timeout += 10 * HZ;
@@ -3804,7 +3849,7 @@ retry:
 	 * Do not retry costly high order allocations unless they are
 	 * __GFP_REPEAT
 	 */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+	if (costly_order && !(gfp_mask & __GFP_REPEAT))
 		goto nopage;
 
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
@@ -3836,7 +3881,9 @@ retry:
 		goto got_pg;
 
 	/* Avoid allocations with no watermarks from looping endlessly */
-	if (test_thread_flag(TIF_MEMDIE))
+	if (test_thread_flag(TIF_MEMDIE) &&
+	    (alloc_flags == ALLOC_NO_WATERMARKS ||
+	     (gfp_mask & __GFP_NOMEMALLOC)))
 		goto nopage;
 
 	/* Retry as long as the OOM killer is making progress */
@@ -3974,10 +4021,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		goto out;
 
 	/*
-	 * Runtime PM, block IO and its error handling path can deadlock
-	 * because I/O on the device might not complete.
+	 * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+	 * resp. GFP_NOIO which has to be inherited for all allocation requests
+	 * from a particular context which has been marked by
+	 * memalloc_no{fs,io}_{save,restore}.
 	 */
-	alloc_mask = memalloc_noio_flags(gfp_mask);
+	alloc_mask = current_gfp_context(gfp_mask);
 	ac.spread_dirty_pages = false;
 
 	/*
@@ -4250,7 +4299,8 @@ EXPORT_SYMBOL(free_pages_exact);
  * nr_free_zone_pages() counts the number of counts pages which are beyond the
  * high watermark within all zones at or below a given zone index. For each
  * zone, the number of pages is calculated as:
- *     managed_pages - high_pages
+ *
+ *     nr_free_zone_pages = managed_pages - high_pages
  */
 static unsigned long nr_free_zone_pages(int offset)
 {
@@ -4512,7 +4562,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #endif
 			" writeback_tmp:%lukB"
 			" unstable:%lukB"
-			" pages_scanned:%lu"
 			" all_unreclaimable? %s"
 			"\n",
 			pgdat->node_id,
@@ -4535,8 +4584,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 #endif
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
-			node_page_state(pgdat, NR_PAGES_SCANNED),
-			!pgdat_reclaimable(pgdat) ? "yes" : "no");
+			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+				"yes" : "no");
 	}
 
 	for_each_populated_zone(zone) {
@@ -6100,7 +6149,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	/* pg_data_t should be reset to zero when it's allocated */
 	WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
 
-	reset_deferred_meminit(pgdat);
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 	pgdat->per_cpu_nodestats = NULL;
@@ -6122,6 +6170,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 		(unsigned long)pgdat->node_mem_map);
 #endif
 
+	reset_deferred_meminit(pgdat);
 	free_area_init_core(pgdat);
 }
 
@@ -7431,7 +7480,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 		.zone = page_zone(pfn_to_page(start)),
 		.mode = MIGRATE_SYNC,
 		.ignore_skip_hint = true,
-		.gfp_mask = memalloc_noio_flags(gfp_mask),
+		.gfp_mask = current_gfp_context(gfp_mask),
 	};
 	INIT_LIST_HEAD(&cc.migratepages);
 