Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  667
1 file changed, 408 insertions(+), 259 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f3e0c69a97b7..eaa64d2ffdc5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,12 +55,13 @@
55#include <linux/kmemleak.h> 55#include <linux/kmemleak.h>
56#include <linux/compaction.h> 56#include <linux/compaction.h>
57#include <trace/events/kmem.h> 57#include <trace/events/kmem.h>
58#include <trace/events/oom.h>
58#include <linux/prefetch.h> 59#include <linux/prefetch.h>
59#include <linux/mm_inline.h> 60#include <linux/mm_inline.h>
60#include <linux/migrate.h> 61#include <linux/migrate.h>
61#include <linux/page_ext.h>
62#include <linux/hugetlb.h> 62#include <linux/hugetlb.h>
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/sched/mm.h>
64#include <linux/page_owner.h> 65#include <linux/page_owner.h>
65#include <linux/kthread.h> 66#include <linux/kthread.h>
66#include <linux/memcontrol.h> 67#include <linux/memcontrol.h>
@@ -91,6 +92,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_);
91int _node_numa_mem_[MAX_NUMNODES]; 92int _node_numa_mem_[MAX_NUMNODES];
92#endif 93#endif
93 94
95/* work_structs for global per-cpu drains */
96DEFINE_MUTEX(pcpu_drain_mutex);
97DEFINE_PER_CPU(struct work_struct, pcpu_drain);
98
94#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY 99#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
95volatile unsigned long latent_entropy __latent_entropy; 100volatile unsigned long latent_entropy __latent_entropy;
96EXPORT_SYMBOL(latent_entropy); 101EXPORT_SYMBOL(latent_entropy);
@@ -714,7 +719,7 @@ static inline void rmv_page_order(struct page *page)
714/* 719/*
715 * This function checks whether a page is free && is the buddy 720 * This function checks whether a page is free && is the buddy
716 * we can do coalesce a page and its buddy if 721 * we can do coalesce a page and its buddy if
717 * (a) the buddy is not in a hole && 722 * (a) the buddy is not in a hole (check before calling!) &&
718 * (b) the buddy is in the buddy system && 723 * (b) the buddy is in the buddy system &&
719 * (c) a page and its buddy have the same order && 724 * (c) a page and its buddy have the same order &&
720 * (d) a page and its buddy are in the same zone. 725 * (d) a page and its buddy are in the same zone.
@@ -729,9 +734,6 @@ static inline void rmv_page_order(struct page *page)
729static inline int page_is_buddy(struct page *page, struct page *buddy, 734static inline int page_is_buddy(struct page *page, struct page *buddy,
730 unsigned int order) 735 unsigned int order)
731{ 736{
732 if (!pfn_valid_within(page_to_pfn(buddy)))
733 return 0;
734
735 if (page_is_guard(buddy) && page_order(buddy) == order) { 737 if (page_is_guard(buddy) && page_order(buddy) == order) {
736 if (page_zone_id(page) != page_zone_id(buddy)) 738 if (page_zone_id(page) != page_zone_id(buddy))
737 return 0; 739 return 0;
@@ -787,9 +789,8 @@ static inline void __free_one_page(struct page *page,
787 struct zone *zone, unsigned int order, 789 struct zone *zone, unsigned int order,
788 int migratetype) 790 int migratetype)
789{ 791{
790 unsigned long page_idx; 792 unsigned long combined_pfn;
791 unsigned long combined_idx; 793 unsigned long uninitialized_var(buddy_pfn);
792 unsigned long uninitialized_var(buddy_idx);
793 struct page *buddy; 794 struct page *buddy;
794 unsigned int max_order; 795 unsigned int max_order;
795 796
@@ -802,15 +803,16 @@ static inline void __free_one_page(struct page *page,
802 if (likely(!is_migrate_isolate(migratetype))) 803 if (likely(!is_migrate_isolate(migratetype)))
803 __mod_zone_freepage_state(zone, 1 << order, migratetype); 804 __mod_zone_freepage_state(zone, 1 << order, migratetype);
804 805
805 page_idx = pfn & ((1 << MAX_ORDER) - 1); 806 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
806
807 VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page);
808 VM_BUG_ON_PAGE(bad_range(zone, page), page); 807 VM_BUG_ON_PAGE(bad_range(zone, page), page);
809 808
810continue_merging: 809continue_merging:
811 while (order < max_order - 1) { 810 while (order < max_order - 1) {
812 buddy_idx = __find_buddy_index(page_idx, order); 811 buddy_pfn = __find_buddy_pfn(pfn, order);
813 buddy = page + (buddy_idx - page_idx); 812 buddy = page + (buddy_pfn - pfn);
813
814 if (!pfn_valid_within(buddy_pfn))
815 goto done_merging;
814 if (!page_is_buddy(page, buddy, order)) 816 if (!page_is_buddy(page, buddy, order))
815 goto done_merging; 817 goto done_merging;
816 /* 818 /*
@@ -824,9 +826,9 @@ continue_merging:
824 zone->free_area[order].nr_free--; 826 zone->free_area[order].nr_free--;
825 rmv_page_order(buddy); 827 rmv_page_order(buddy);
826 } 828 }
827 combined_idx = buddy_idx & page_idx; 829 combined_pfn = buddy_pfn & pfn;
828 page = page + (combined_idx - page_idx); 830 page = page + (combined_pfn - pfn);
829 page_idx = combined_idx; 831 pfn = combined_pfn;
830 order++; 832 order++;
831 } 833 }
832 if (max_order < MAX_ORDER) { 834 if (max_order < MAX_ORDER) {
@@ -841,8 +843,8 @@ continue_merging:
841 if (unlikely(has_isolate_pageblock(zone))) { 843 if (unlikely(has_isolate_pageblock(zone))) {
842 int buddy_mt; 844 int buddy_mt;
843 845
844 buddy_idx = __find_buddy_index(page_idx, order); 846 buddy_pfn = __find_buddy_pfn(pfn, order);
845 buddy = page + (buddy_idx - page_idx); 847 buddy = page + (buddy_pfn - pfn);
846 buddy_mt = get_pageblock_migratetype(buddy); 848 buddy_mt = get_pageblock_migratetype(buddy);
847 849
848 if (migratetype != buddy_mt 850 if (migratetype != buddy_mt
@@ -865,12 +867,12 @@ done_merging:
865 * so it's less likely to be used soon and more likely to be merged 867 * so it's less likely to be used soon and more likely to be merged
866 * as a higher order page 868 * as a higher order page
867 */ 869 */
868 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 870 if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
869 struct page *higher_page, *higher_buddy; 871 struct page *higher_page, *higher_buddy;
870 combined_idx = buddy_idx & page_idx; 872 combined_pfn = buddy_pfn & pfn;
871 higher_page = page + (combined_idx - page_idx); 873 higher_page = page + (combined_pfn - pfn);
872 buddy_idx = __find_buddy_index(combined_idx, order + 1); 874 buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
873 higher_buddy = higher_page + (buddy_idx - combined_idx); 875 higher_buddy = higher_page + (buddy_pfn - combined_pfn);
874 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 876 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
875 list_add_tail(&page->lru, 877 list_add_tail(&page->lru,
876 &zone->free_area[order].free_list[migratetype]); 878 &zone->free_area[order].free_list[migratetype]);
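
The merge loop above is the classic buddy-system arithmetic, now expressed directly in page frame numbers: the buddy of a 2^order block differs from it only in bit 'order' (which is essentially what __find_buddy_pfn() computes), and 'buddy_pfn & pfn' clears that bit to give the start of the merged block. The following is a minimal userspace sketch of that arithmetic, not kernel code; find_buddy_pfn() here is an invented stand-in for the kernel helper.

/* Illustrative sketch only -- not the kernel implementation.  It mimics the
 * pfn arithmetic the patch switches to: the buddy of a block at 'pfn' with
 * size 2^order differs only in bit 'order', and the merged block starts at
 * the lower of the two pfns. */
#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);	/* flip bit 'order' */
}

int main(void)
{
	unsigned long pfn = 0x1234c0;	/* arbitrary order-6-aligned pfn */
	unsigned int order;

	for (order = 6; order < 10; order++) {
		unsigned long buddy_pfn = find_buddy_pfn(pfn, order);
		unsigned long combined_pfn = buddy_pfn & pfn; /* lower pfn of the pair */

		printf("order %u: pfn %#lx buddy %#lx merged block starts at %#lx\n",
		       order, pfn, buddy_pfn, combined_pfn);
		pfn = combined_pfn;	/* continue merging upwards */
	}
	return 0;
}

Compiled with any C compiler, it prints successively larger merged blocks, mirroring how __free_one_page() walks up the orders.
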
@@ -1087,10 +1089,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1087{ 1089{
1088 int migratetype = 0; 1090 int migratetype = 0;
1089 int batch_free = 0; 1091 int batch_free = 0;
1090 unsigned long nr_scanned; 1092 unsigned long nr_scanned, flags;
1091 bool isolated_pageblocks; 1093 bool isolated_pageblocks;
1092 1094
1093 spin_lock(&zone->lock); 1095 spin_lock_irqsave(&zone->lock, flags);
1094 isolated_pageblocks = has_isolate_pageblock(zone); 1096 isolated_pageblocks = has_isolate_pageblock(zone);
1095 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); 1097 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
1096 if (nr_scanned) 1098 if (nr_scanned)
@@ -1139,7 +1141,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1139 trace_mm_page_pcpu_drain(page, 0, mt); 1141 trace_mm_page_pcpu_drain(page, 0, mt);
1140 } while (--count && --batch_free && !list_empty(list)); 1142 } while (--count && --batch_free && !list_empty(list));
1141 } 1143 }
1142 spin_unlock(&zone->lock); 1144 spin_unlock_irqrestore(&zone->lock, flags);
1143} 1145}
1144 1146
1145static void free_one_page(struct zone *zone, 1147static void free_one_page(struct zone *zone,
@@ -1147,8 +1149,9 @@ static void free_one_page(struct zone *zone,
1147 unsigned int order, 1149 unsigned int order,
1148 int migratetype) 1150 int migratetype)
1149{ 1151{
1150 unsigned long nr_scanned; 1152 unsigned long nr_scanned, flags;
1151 spin_lock(&zone->lock); 1153 spin_lock_irqsave(&zone->lock, flags);
1154 __count_vm_events(PGFREE, 1 << order);
1152 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); 1155 nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
1153 if (nr_scanned) 1156 if (nr_scanned)
1154 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); 1157 __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
@@ -1158,7 +1161,7 @@ static void free_one_page(struct zone *zone,
1158 migratetype = get_pfnblock_migratetype(page, pfn); 1161 migratetype = get_pfnblock_migratetype(page, pfn);
1159 } 1162 }
1160 __free_one_page(page, pfn, zone, order, migratetype); 1163 __free_one_page(page, pfn, zone, order, migratetype);
1161 spin_unlock(&zone->lock); 1164 spin_unlock_irqrestore(&zone->lock, flags);
1162} 1165}
1163 1166
1164static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1167static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1236,7 +1239,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1236 1239
1237static void __free_pages_ok(struct page *page, unsigned int order) 1240static void __free_pages_ok(struct page *page, unsigned int order)
1238{ 1241{
1239 unsigned long flags;
1240 int migratetype; 1242 int migratetype;
1241 unsigned long pfn = page_to_pfn(page); 1243 unsigned long pfn = page_to_pfn(page);
1242 1244
@@ -1244,10 +1246,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
1244 return; 1246 return;
1245 1247
1246 migratetype = get_pfnblock_migratetype(page, pfn); 1248 migratetype = get_pfnblock_migratetype(page, pfn);
1247 local_irq_save(flags);
1248 __count_vm_events(PGFREE, 1 << order);
1249 free_one_page(page_zone(page), page, pfn, order, migratetype); 1249 free_one_page(page_zone(page), page, pfn, order, migratetype);
1250 local_irq_restore(flags);
1251} 1250}
1252 1251
1253static void __init __free_pages_boot_core(struct page *page, unsigned int order) 1252static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -2219,8 +2218,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2219 int migratetype, bool cold) 2218 int migratetype, bool cold)
2220{ 2219{
2221 int i, alloced = 0; 2220 int i, alloced = 0;
2221 unsigned long flags;
2222 2222
2223 spin_lock(&zone->lock); 2223 spin_lock_irqsave(&zone->lock, flags);
2224 for (i = 0; i < count; ++i) { 2224 for (i = 0; i < count; ++i) {
2225 struct page *page = __rmqueue(zone, order, migratetype); 2225 struct page *page = __rmqueue(zone, order, migratetype);
2226 if (unlikely(page == NULL)) 2226 if (unlikely(page == NULL))
@@ -2256,7 +2256,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2256 * pages added to the pcp list. 2256 * pages added to the pcp list.
2257 */ 2257 */
2258 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2258 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2259 spin_unlock(&zone->lock); 2259 spin_unlock_irqrestore(&zone->lock, flags);
2260 return alloced; 2260 return alloced;
2261} 2261}
2262 2262
@@ -2341,16 +2341,26 @@ void drain_local_pages(struct zone *zone)
2341 drain_pages(cpu); 2341 drain_pages(cpu);
2342} 2342}
2343 2343
2344static void drain_local_pages_wq(struct work_struct *work)
2345{
2346 /*
2347 * drain_all_pages doesn't use proper cpu hotplug protection so
2348 * we can race with cpu offline when the WQ can move this from
2349 * a cpu pinned worker to an unbound one. We can operate on a different
2350 * cpu which is allright but we also have to make sure to not move to
2351 * a different one.
2352 */
2353 preempt_disable();
2354 drain_local_pages(NULL);
2355 preempt_enable();
2356}
2357
2344/* 2358/*
2345 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 2359 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2346 * 2360 *
2347 * When zone parameter is non-NULL, spill just the single zone's pages. 2361 * When zone parameter is non-NULL, spill just the single zone's pages.
2348 * 2362 *
2349 * Note that this code is protected against sending an IPI to an offline 2363 * Note that this can be extremely slow as the draining happens in a workqueue.
2350 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
2351 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
2352 * nothing keeps CPUs from showing up after we populated the cpumask and
2353 * before the call to on_each_cpu_mask().
2354 */ 2364 */
2355void drain_all_pages(struct zone *zone) 2365void drain_all_pages(struct zone *zone)
2356{ 2366{
@@ -2362,6 +2372,21 @@ void drain_all_pages(struct zone *zone)
2362 */ 2372 */
2363 static cpumask_t cpus_with_pcps; 2373 static cpumask_t cpus_with_pcps;
2364 2374
2375 /* Workqueues cannot recurse */
2376 if (current->flags & PF_WQ_WORKER)
2377 return;
2378
2379 /*
2380 * Do not drain if one is already in progress unless it's specific to
2381 * a zone. Such callers are primarily CMA and memory hotplug and need
2382 * the drain to be complete when the call returns.
2383 */
2384 if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2385 if (!zone)
2386 return;
2387 mutex_lock(&pcpu_drain_mutex);
2388 }
2389
2365 /* 2390 /*
2366 * We don't care about racing with CPU hotplug event 2391 * We don't care about racing with CPU hotplug event
2367 * as offline notification will cause the notified 2392 * as offline notification will cause the notified
@@ -2392,8 +2417,16 @@ void drain_all_pages(struct zone *zone)
2392 else 2417 else
2393 cpumask_clear_cpu(cpu, &cpus_with_pcps); 2418 cpumask_clear_cpu(cpu, &cpus_with_pcps);
2394 } 2419 }
2395 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, 2420
2396 zone, 1); 2421 for_each_cpu(cpu, &cpus_with_pcps) {
2422 struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2423 INIT_WORK(work, drain_local_pages_wq);
2424 schedule_work_on(cpu, work);
2425 }
2426 for_each_cpu(cpu, &cpus_with_pcps)
2427 flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2428
2429 mutex_unlock(&pcpu_drain_mutex);
2397} 2430}
2398 2431
2399#ifdef CONFIG_HIBERNATION 2432#ifdef CONFIG_HIBERNATION
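
The rewritten drain_all_pages() above replaces the IPI broadcast with one work item per target CPU, queued with schedule_work_on() and then waited for with flush_work(). Below is a hedged, self-contained module sketch of that queue-then-flush pattern; all demo_* names are invented, and unlike the patch, which deliberately avoids hotplug locking and relies on drain_local_pages_wq()'s preempt_disable(), the sketch simply takes get_online_cpus() for simplicity.

/* Hedged sketch, not part of the patch: a toy module reproducing the
 * queue-one-work-per-CPU-then-flush pattern used by the new drain_all_pages(). */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/cpu.h>

static DEFINE_PER_CPU(struct work_struct, demo_drain);

static void demo_drain_fn(struct work_struct *work)
{
	/* Like drain_local_pages_wq(): stay pinned while doing per-cpu work. */
	preempt_disable();
	pr_info("demo drain on CPU %d\n", smp_processor_id());
	preempt_enable();
}

static int __init demo_init(void)
{
	int cpu;

	get_online_cpus();	/* simpler than the patch: just block hotplug */
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(&demo_drain, cpu);

		INIT_WORK(work, demo_drain_fn);
		schedule_work_on(cpu, work);	/* run on that CPU's kworker */
	}
	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(&demo_drain, cpu));	/* wait for all */
	put_online_cpus();

	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
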
@@ -2444,17 +2477,20 @@ void free_hot_cold_page(struct page *page, bool cold)
2444{ 2477{
2445 struct zone *zone = page_zone(page); 2478 struct zone *zone = page_zone(page);
2446 struct per_cpu_pages *pcp; 2479 struct per_cpu_pages *pcp;
2447 unsigned long flags;
2448 unsigned long pfn = page_to_pfn(page); 2480 unsigned long pfn = page_to_pfn(page);
2449 int migratetype; 2481 int migratetype;
2450 2482
2483 if (in_interrupt()) {
2484 __free_pages_ok(page, 0);
2485 return;
2486 }
2487
2451 if (!free_pcp_prepare(page)) 2488 if (!free_pcp_prepare(page))
2452 return; 2489 return;
2453 2490
2454 migratetype = get_pfnblock_migratetype(page, pfn); 2491 migratetype = get_pfnblock_migratetype(page, pfn);
2455 set_pcppage_migratetype(page, migratetype); 2492 set_pcppage_migratetype(page, migratetype);
2456 local_irq_save(flags); 2493 preempt_disable();
2457 __count_vm_event(PGFREE);
2458 2494
2459 /* 2495 /*
2460 * We only track unmovable, reclaimable and movable on pcp lists. 2496 * We only track unmovable, reclaimable and movable on pcp lists.
@@ -2471,6 +2507,7 @@ void free_hot_cold_page(struct page *page, bool cold)
2471 migratetype = MIGRATE_MOVABLE; 2507 migratetype = MIGRATE_MOVABLE;
2472 } 2508 }
2473 2509
2510 __count_vm_event(PGFREE);
2474 pcp = &this_cpu_ptr(zone->pageset)->pcp; 2511 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2475 if (!cold) 2512 if (!cold)
2476 list_add(&page->lru, &pcp->lists[migratetype]); 2513 list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2484,7 +2521,7 @@ void free_hot_cold_page(struct page *page, bool cold)
2484 } 2521 }
2485 2522
2486out: 2523out:
2487 local_irq_restore(flags); 2524 preempt_enable();
2488} 2525}
2489 2526
2490/* 2527/*
@@ -2602,74 +2639,105 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2602#endif 2639#endif
2603} 2640}
2604 2641
2642/* Remove page from the per-cpu list, caller must protect the list */
2643static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2644 bool cold, struct per_cpu_pages *pcp,
2645 struct list_head *list)
2646{
2647 struct page *page;
2648
2649 VM_BUG_ON(in_interrupt());
2650
2651 do {
2652 if (list_empty(list)) {
2653 pcp->count += rmqueue_bulk(zone, 0,
2654 pcp->batch, list,
2655 migratetype, cold);
2656 if (unlikely(list_empty(list)))
2657 return NULL;
2658 }
2659
2660 if (cold)
2661 page = list_last_entry(list, struct page, lru);
2662 else
2663 page = list_first_entry(list, struct page, lru);
2664
2665 list_del(&page->lru);
2666 pcp->count--;
2667 } while (check_new_pcp(page));
2668
2669 return page;
2670}
2671
2672/* Lock and remove page from the per-cpu list */
2673static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2674 struct zone *zone, unsigned int order,
2675 gfp_t gfp_flags, int migratetype)
2676{
2677 struct per_cpu_pages *pcp;
2678 struct list_head *list;
2679 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2680 struct page *page;
2681
2682 preempt_disable();
2683 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2684 list = &pcp->lists[migratetype];
2685 page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
2686 if (page) {
2687 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2688 zone_statistics(preferred_zone, zone);
2689 }
2690 preempt_enable();
2691 return page;
2692}
2693
2605/* 2694/*
2606 * Allocate a page from the given zone. Use pcplists for order-0 allocations. 2695 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
2607 */ 2696 */
2608static inline 2697static inline
2609struct page *buffered_rmqueue(struct zone *preferred_zone, 2698struct page *rmqueue(struct zone *preferred_zone,
2610 struct zone *zone, unsigned int order, 2699 struct zone *zone, unsigned int order,
2611 gfp_t gfp_flags, unsigned int alloc_flags, 2700 gfp_t gfp_flags, unsigned int alloc_flags,
2612 int migratetype) 2701 int migratetype)
2613{ 2702{
2614 unsigned long flags; 2703 unsigned long flags;
2615 struct page *page; 2704 struct page *page;
2616 bool cold = ((gfp_flags & __GFP_COLD) != 0);
2617 2705
2618 if (likely(order == 0)) { 2706 if (likely(order == 0) && !in_interrupt()) {
2619 struct per_cpu_pages *pcp; 2707 page = rmqueue_pcplist(preferred_zone, zone, order,
2620 struct list_head *list; 2708 gfp_flags, migratetype);
2621 2709 goto out;
2622 local_irq_save(flags); 2710 }
2623 do {
2624 pcp = &this_cpu_ptr(zone->pageset)->pcp;
2625 list = &pcp->lists[migratetype];
2626 if (list_empty(list)) {
2627 pcp->count += rmqueue_bulk(zone, 0,
2628 pcp->batch, list,
2629 migratetype, cold);
2630 if (unlikely(list_empty(list)))
2631 goto failed;
2632 }
2633
2634 if (cold)
2635 page = list_last_entry(list, struct page, lru);
2636 else
2637 page = list_first_entry(list, struct page, lru);
2638
2639 list_del(&page->lru);
2640 pcp->count--;
2641 2711
2642 } while (check_new_pcp(page)); 2712 /*
2643 } else { 2713 * We most definitely don't want callers attempting to
2644 /* 2714 * allocate greater than order-1 page units with __GFP_NOFAIL.
2645 * We most definitely don't want callers attempting to 2715 */
2646 * allocate greater than order-1 page units with __GFP_NOFAIL. 2716 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2647 */ 2717 spin_lock_irqsave(&zone->lock, flags);
2648 WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2649 spin_lock_irqsave(&zone->lock, flags);
2650 2718
2651 do { 2719 do {
2652 page = NULL; 2720 page = NULL;
2653 if (alloc_flags & ALLOC_HARDER) { 2721 if (alloc_flags & ALLOC_HARDER) {
2654 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); 2722 page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2655 if (page) 2723 if (page)
2656 trace_mm_page_alloc_zone_locked(page, order, migratetype); 2724 trace_mm_page_alloc_zone_locked(page, order, migratetype);
2657 } 2725 }
2658 if (!page)
2659 page = __rmqueue(zone, order, migratetype);
2660 } while (page && check_new_pages(page, order));
2661 spin_unlock(&zone->lock);
2662 if (!page) 2726 if (!page)
2663 goto failed; 2727 page = __rmqueue(zone, order, migratetype);
2664 __mod_zone_freepage_state(zone, -(1 << order), 2728 } while (page && check_new_pages(page, order));
2665 get_pcppage_migratetype(page)); 2729 spin_unlock(&zone->lock);
2666 } 2730 if (!page)
2731 goto failed;
2732 __mod_zone_freepage_state(zone, -(1 << order),
2733 get_pcppage_migratetype(page));
2667 2734
2668 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); 2735 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2669 zone_statistics(preferred_zone, zone); 2736 zone_statistics(preferred_zone, zone);
2670 local_irq_restore(flags); 2737 local_irq_restore(flags);
2671 2738
2672 VM_BUG_ON_PAGE(bad_range(zone, page), page); 2739out:
2740 VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
2673 return page; 2741 return page;
2674 2742
2675failed: 2743failed:
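
The new __rmqueue_pcplist() above keeps the long-standing hot/cold convention of the per-cpu lists: cold requests take list_last_entry() from the tail, everything else takes list_first_entry() from the head, where recently freed, cache-warm pages were pushed. A small standalone C sketch of that list discipline follows; demo_page and the list helpers are invented for illustration and are not kernel code.

/* Illustration only: head = hot (LIFO, cache-warm), tail = cold. */
#include <stdbool.h>
#include <stdio.h>

struct demo_page {
	int id;
	struct demo_page *prev, *next;
};

/* circular doubly linked list with a dummy head, kernel-list style */
static struct demo_page list = { .prev = &list, .next = &list };

static void list_add_head(struct demo_page *p)
{
	p->next = list.next;
	p->prev = &list;
	list.next->prev = p;
	list.next = p;
}

static void list_add_tail(struct demo_page *p)
{
	p->prev = list.prev;
	p->next = &list;
	list.prev->next = p;
	list.prev = p;
}

static struct demo_page *list_take(bool cold)
{
	struct demo_page *p = cold ? list.prev : list.next;

	if (p == &list)
		return NULL;	/* empty: rmqueue_bulk() would refill here */
	p->prev->next = p->next;
	p->next->prev = p->prev;
	return p;
}

int main(void)
{
	struct demo_page pages[4] = { {1}, {2}, {3}, {4} };

	list_add_head(&pages[0]);	/* freed hot */
	list_add_head(&pages[1]);	/* freed hot */
	list_add_tail(&pages[2]);	/* freed cold */
	list_add_tail(&pages[3]);	/* freed cold */

	printf("hot alloc  -> page %d\n", list_take(false)->id);	/* 2, most recently freed */
	printf("cold alloc -> page %d\n", list_take(true)->id);	/* 4, least cache-warm */
	return 0;
}
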
@@ -2877,7 +2945,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
2877#ifdef CONFIG_NUMA 2945#ifdef CONFIG_NUMA
2878static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 2946static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
2879{ 2947{
2880 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < 2948 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
2881 RECLAIM_DISTANCE; 2949 RECLAIM_DISTANCE;
2882} 2950}
2883#else /* CONFIG_NUMA */ 2951#else /* CONFIG_NUMA */
@@ -2974,7 +3042,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2974 } 3042 }
2975 3043
2976try_this_zone: 3044try_this_zone:
2977 page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, 3045 page = rmqueue(ac->preferred_zoneref->zone, zone, order,
2978 gfp_mask, alloc_flags, ac->migratetype); 3046 gfp_mask, alloc_flags, ac->migratetype);
2979 if (page) { 3047 if (page) {
2980 prep_new_page(page, order, gfp_mask, alloc_flags); 3048 prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -3007,18 +3075,12 @@ static inline bool should_suppress_show_mem(void)
3007 return ret; 3075 return ret;
3008} 3076}
3009 3077
3010static DEFINE_RATELIMIT_STATE(nopage_rs, 3078static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3011 DEFAULT_RATELIMIT_INTERVAL,
3012 DEFAULT_RATELIMIT_BURST);
3013
3014void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
3015{ 3079{
3016 unsigned int filter = SHOW_MEM_FILTER_NODES; 3080 unsigned int filter = SHOW_MEM_FILTER_NODES;
3017 struct va_format vaf; 3081 static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3018 va_list args;
3019 3082
3020 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || 3083 if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3021 debug_guardpage_minorder() > 0)
3022 return; 3084 return;
3023 3085
3024 /* 3086 /*
@@ -3033,6 +3095,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
3033 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) 3095 if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3034 filter &= ~SHOW_MEM_FILTER_NODES; 3096 filter &= ~SHOW_MEM_FILTER_NODES;
3035 3097
3098 show_mem(filter, nodemask);
3099}
3100
3101void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3102{
3103 struct va_format vaf;
3104 va_list args;
3105 static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3106 DEFAULT_RATELIMIT_BURST);
3107
3108 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
3109 debug_guardpage_minorder() > 0)
3110 return;
3111
3036 pr_warn("%s: ", current->comm); 3112 pr_warn("%s: ", current->comm);
3037 3113
3038 va_start(args, fmt); 3114 va_start(args, fmt);
@@ -3041,11 +3117,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
3041 pr_cont("%pV", &vaf); 3117 pr_cont("%pV", &vaf);
3042 va_end(args); 3118 va_end(args);
3043 3119
3044 pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); 3120 pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3121 if (nodemask)
3122 pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3123 else
3124 pr_cont("(null)\n");
3125
3126 cpuset_print_current_mems_allowed();
3045 3127
3046 dump_stack(); 3128 dump_stack();
3047 if (!should_suppress_show_mem()) 3129 warn_alloc_show_mem(gfp_mask, nodemask);
3048 show_mem(filter); 3130}
3131
3132static inline struct page *
3133__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3134 unsigned int alloc_flags,
3135 const struct alloc_context *ac)
3136{
3137 struct page *page;
3138
3139 page = get_page_from_freelist(gfp_mask, order,
3140 alloc_flags|ALLOC_CPUSET, ac);
3141 /*
3142 * fallback to ignore cpuset restriction if our nodes
3143 * are depleted
3144 */
3145 if (!page)
3146 page = get_page_from_freelist(gfp_mask, order,
3147 alloc_flags, ac);
3148
3149 return page;
3049} 3150}
3050 3151
3051static inline struct page * 3152static inline struct page *
@@ -3083,47 +3184,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3083 if (page) 3184 if (page)
3084 goto out; 3185 goto out;
3085 3186
3086 if (!(gfp_mask & __GFP_NOFAIL)) { 3187 /* Coredumps can quickly deplete all memory reserves */
3087 /* Coredumps can quickly deplete all memory reserves */ 3188 if (current->flags & PF_DUMPCORE)
3088 if (current->flags & PF_DUMPCORE) 3189 goto out;
3089 goto out; 3190 /* The OOM killer will not help higher order allocs */
3090 /* The OOM killer will not help higher order allocs */ 3191 if (order > PAGE_ALLOC_COSTLY_ORDER)
3091 if (order > PAGE_ALLOC_COSTLY_ORDER) 3192 goto out;
3092 goto out; 3193 /* The OOM killer does not needlessly kill tasks for lowmem */
3093 /* The OOM killer does not needlessly kill tasks for lowmem */ 3194 if (ac->high_zoneidx < ZONE_NORMAL)
3094 if (ac->high_zoneidx < ZONE_NORMAL) 3195 goto out;
3095 goto out; 3196 if (pm_suspended_storage())
3096 if (pm_suspended_storage()) 3197 goto out;
3097 goto out; 3198 /*
3098 /* 3199 * XXX: GFP_NOFS allocations should rather fail than rely on
3099 * XXX: GFP_NOFS allocations should rather fail than rely on 3200 * other request to make a forward progress.
3100 * other request to make a forward progress. 3201 * We are in an unfortunate situation where out_of_memory cannot
3101 * We are in an unfortunate situation where out_of_memory cannot 3202 * do much for this context but let's try it to at least get
3102 * do much for this context but let's try it to at least get 3203 * access to memory reserved if the current task is killed (see
3103 * access to memory reserved if the current task is killed (see 3204 * out_of_memory). Once filesystems are ready to handle allocation
3104 * out_of_memory). Once filesystems are ready to handle allocation 3205 * failures more gracefully we should just bail out here.
3105 * failures more gracefully we should just bail out here. 3206 */
3106 */ 3207
3208 /* The OOM killer may not free memory on a specific node */
3209 if (gfp_mask & __GFP_THISNODE)
3210 goto out;
3107 3211
3108 /* The OOM killer may not free memory on a specific node */
3109 if (gfp_mask & __GFP_THISNODE)
3110 goto out;
3111 }
3112 /* Exhausted what can be done so it's blamo time */ 3212 /* Exhausted what can be done so it's blamo time */
3113 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { 3213 if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3114 *did_some_progress = 1; 3214 *did_some_progress = 1;
3115 3215
3116 if (gfp_mask & __GFP_NOFAIL) { 3216 /*
3117 page = get_page_from_freelist(gfp_mask, order, 3217 * Help non-failing allocations by giving them access to memory
3118 ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); 3218 * reserves
3119 /* 3219 */
3120 * fallback to ignore cpuset restriction if our nodes 3220 if (gfp_mask & __GFP_NOFAIL)
3121 * are depleted 3221 page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3122 */
3123 if (!page)
3124 page = get_page_from_freelist(gfp_mask, order,
3125 ALLOC_NO_WATERMARKS, ac); 3222 ALLOC_NO_WATERMARKS, ac);
3126 }
3127 } 3223 }
3128out: 3224out:
3129 mutex_unlock(&oom_lock); 3225 mutex_unlock(&oom_lock);
@@ -3192,6 +3288,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3192{ 3288{
3193 int max_retries = MAX_COMPACT_RETRIES; 3289 int max_retries = MAX_COMPACT_RETRIES;
3194 int min_priority; 3290 int min_priority;
3291 bool ret = false;
3292 int retries = *compaction_retries;
3293 enum compact_priority priority = *compact_priority;
3195 3294
3196 if (!order) 3295 if (!order)
3197 return false; 3296 return false;
@@ -3213,8 +3312,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3213 * But do not retry if the given zonelist is not suitable for 3312 * But do not retry if the given zonelist is not suitable for
3214 * compaction. 3313 * compaction.
3215 */ 3314 */
3216 if (compaction_withdrawn(compact_result)) 3315 if (compaction_withdrawn(compact_result)) {
3217 return compaction_zonelist_suitable(ac, order, alloc_flags); 3316 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3317 goto out;
3318 }
3218 3319
3219 /* 3320 /*
3220 * !costly requests are much more important than __GFP_REPEAT 3321 * !costly requests are much more important than __GFP_REPEAT
@@ -3226,8 +3327,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3226 */ 3327 */
3227 if (order > PAGE_ALLOC_COSTLY_ORDER) 3328 if (order > PAGE_ALLOC_COSTLY_ORDER)
3228 max_retries /= 4; 3329 max_retries /= 4;
3229 if (*compaction_retries <= max_retries) 3330 if (*compaction_retries <= max_retries) {
3230 return true; 3331 ret = true;
3332 goto out;
3333 }
3231 3334
3232 /* 3335 /*
3233 * Make sure there are attempts at the highest priority if we exhausted 3336 * Make sure there are attempts at the highest priority if we exhausted
@@ -3236,12 +3339,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3236check_priority: 3339check_priority:
3237 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? 3340 min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3238 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; 3341 MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3342
3239 if (*compact_priority > min_priority) { 3343 if (*compact_priority > min_priority) {
3240 (*compact_priority)--; 3344 (*compact_priority)--;
3241 *compaction_retries = 0; 3345 *compaction_retries = 0;
3242 return true; 3346 ret = true;
3243 } 3347 }
3244 return false; 3348out:
3349 trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3350 return ret;
3245} 3351}
3246#else 3352#else
3247static inline struct page * 3353static inline struct page *
@@ -3464,6 +3570,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3464 ac->nodemask) { 3570 ac->nodemask) {
3465 unsigned long available; 3571 unsigned long available;
3466 unsigned long reclaimable; 3572 unsigned long reclaimable;
3573 unsigned long min_wmark = min_wmark_pages(zone);
3574 bool wmark;
3467 3575
3468 available = reclaimable = zone_reclaimable_pages(zone); 3576 available = reclaimable = zone_reclaimable_pages(zone);
3469 available -= DIV_ROUND_UP((*no_progress_loops) * available, 3577 available -= DIV_ROUND_UP((*no_progress_loops) * available,
@@ -3474,8 +3582,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3474 * Would the allocation succeed if we reclaimed the whole 3582 * Would the allocation succeed if we reclaimed the whole
3475 * available? 3583 * available?
3476 */ 3584 */
3477 if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), 3585 wmark = __zone_watermark_ok(zone, order, min_wmark,
3478 ac_classzone_idx(ac), alloc_flags, available)) { 3586 ac_classzone_idx(ac), alloc_flags, available);
3587 trace_reclaim_retry_zone(z, order, reclaimable,
3588 available, min_wmark, *no_progress_loops, wmark);
3589 if (wmark) {
3479 /* 3590 /*
3480 * If we didn't make any progress and have a lot of 3591 * If we didn't make any progress and have a lot of
3481 * dirty + writeback pages then we should wait for 3592 * dirty + writeback pages then we should wait for
@@ -3555,6 +3666,14 @@ retry_cpuset:
3555 no_progress_loops = 0; 3666 no_progress_loops = 0;
3556 compact_priority = DEF_COMPACT_PRIORITY; 3667 compact_priority = DEF_COMPACT_PRIORITY;
3557 cpuset_mems_cookie = read_mems_allowed_begin(); 3668 cpuset_mems_cookie = read_mems_allowed_begin();
3669
3670 /*
3671 * The fast path uses conservative alloc_flags to succeed only until
3672 * kswapd needs to be woken up, and to avoid the cost of setting up
3673 * alloc_flags precisely. So we do that now.
3674 */
3675 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3676
3558 /* 3677 /*
3559 * We need to recalculate the starting point for the zonelist iterator 3678 * We need to recalculate the starting point for the zonelist iterator
3560 * because we might have used different nodemask in the fast path, or 3679 * because we might have used different nodemask in the fast path, or
@@ -3566,14 +3685,6 @@ retry_cpuset:
3566 if (!ac->preferred_zoneref->zone) 3685 if (!ac->preferred_zoneref->zone)
3567 goto nopage; 3686 goto nopage;
3568 3687
3569
3570 /*
3571 * The fast path uses conservative alloc_flags to succeed only until
3572 * kswapd needs to be woken up, and to avoid the cost of setting up
3573 * alloc_flags precisely. So we do that now.
3574 */
3575 alloc_flags = gfp_to_alloc_flags(gfp_mask);
3576
3577 if (gfp_mask & __GFP_KSWAPD_RECLAIM) 3688 if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3578 wake_all_kswapds(order, ac); 3689 wake_all_kswapds(order, ac);
3579 3690
@@ -3650,35 +3761,21 @@ retry:
3650 goto got_pg; 3761 goto got_pg;
3651 3762
3652 /* Caller is not willing to reclaim, we can't balance anything */ 3763 /* Caller is not willing to reclaim, we can't balance anything */
3653 if (!can_direct_reclaim) { 3764 if (!can_direct_reclaim)
3654 /*
3655 * All existing users of the __GFP_NOFAIL are blockable, so warn
3656 * of any new users that actually allow this type of allocation
3657 * to fail.
3658 */
3659 WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
3660 goto nopage; 3765 goto nopage;
3661 }
3662 3766
3663 /* Avoid recursion of direct reclaim */ 3767 /* Make sure we know about allocations which stall for too long */
3664 if (current->flags & PF_MEMALLOC) { 3768 if (time_after(jiffies, alloc_start + stall_timeout)) {
3665 /* 3769 warn_alloc(gfp_mask, ac->nodemask,
3666 * __GFP_NOFAIL request from this context is rather bizarre 3770 "page allocation stalls for %ums, order:%u",
3667 * because we cannot reclaim anything and only can loop waiting 3771 jiffies_to_msecs(jiffies-alloc_start), order);
3668 * for somebody to do a work for us. 3772 stall_timeout += 10 * HZ;
3669 */
3670 if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3671 cond_resched();
3672 goto retry;
3673 }
3674 goto nopage;
3675 } 3773 }
3676 3774
3677 /* Avoid allocations with no watermarks from looping endlessly */ 3775 /* Avoid recursion of direct reclaim */
3678 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 3776 if (current->flags & PF_MEMALLOC)
3679 goto nopage; 3777 goto nopage;
3680 3778
3681
3682 /* Try direct reclaim and then allocating */ 3779 /* Try direct reclaim and then allocating */
3683 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, 3780 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
3684 &did_some_progress); 3781 &did_some_progress);
@@ -3702,14 +3799,6 @@ retry:
3702 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) 3799 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
3703 goto nopage; 3800 goto nopage;
3704 3801
3705 /* Make sure we know about allocations which stall for too long */
3706 if (time_after(jiffies, alloc_start + stall_timeout)) {
3707 warn_alloc(gfp_mask,
3708 "page allocation stalls for %ums, order:%u",
3709 jiffies_to_msecs(jiffies-alloc_start), order);
3710 stall_timeout += 10 * HZ;
3711 }
3712
3713 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, 3802 if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
3714 did_some_progress > 0, &no_progress_loops)) 3803 did_some_progress > 0, &no_progress_loops))
3715 goto retry; 3804 goto retry;
@@ -3738,6 +3827,10 @@ retry:
3738 if (page) 3827 if (page)
3739 goto got_pg; 3828 goto got_pg;
3740 3829
3830 /* Avoid allocations with no watermarks from looping endlessly */
3831 if (test_thread_flag(TIF_MEMDIE))
3832 goto nopage;
3833
3741 /* Retry as long as the OOM killer is making progress */ 3834 /* Retry as long as the OOM killer is making progress */
3742 if (did_some_progress) { 3835 if (did_some_progress) {
3743 no_progress_loops = 0; 3836 no_progress_loops = 0;
@@ -3755,82 +3848,123 @@ nopage:
3755 if (read_mems_allowed_retry(cpuset_mems_cookie)) 3848 if (read_mems_allowed_retry(cpuset_mems_cookie))
3756 goto retry_cpuset; 3849 goto retry_cpuset;
3757 3850
3758 warn_alloc(gfp_mask, 3851 /*
3852 * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
3853 * we always retry
3854 */
3855 if (gfp_mask & __GFP_NOFAIL) {
3856 /*
3857 * All existing users of the __GFP_NOFAIL are blockable, so warn
3858 * of any new users that actually require GFP_NOWAIT
3859 */
3860 if (WARN_ON_ONCE(!can_direct_reclaim))
3861 goto fail;
3862
3863 /*
3864 * PF_MEMALLOC request from this context is rather bizarre
3865 * because we cannot reclaim anything and only can loop waiting
3866 * for somebody to do a work for us
3867 */
3868 WARN_ON_ONCE(current->flags & PF_MEMALLOC);
3869
3870 /*
3871 * non failing costly orders are a hard requirement which we
3872 * are not prepared for much so let's warn about these users
3873 * so that we can identify them and convert them to something
3874 * else.
3875 */
3876 WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
3877
3878 /*
3879 * Help non-failing allocations by giving them access to memory
3880 * reserves but do not use ALLOC_NO_WATERMARKS because this
3881 * could deplete whole memory reserves which would just make
3882 * the situation worse
3883 */
3884 page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
3885 if (page)
3886 goto got_pg;
3887
3888 cond_resched();
3889 goto retry;
3890 }
3891fail:
3892 warn_alloc(gfp_mask, ac->nodemask,
3759 "page allocation failure: order:%u", order); 3893 "page allocation failure: order:%u", order);
3760got_pg: 3894got_pg:
3761 return page; 3895 return page;
3762} 3896}
3763 3897
3764/* 3898static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
3765 * This is the 'heart' of the zoned buddy allocator. 3899 struct zonelist *zonelist, nodemask_t *nodemask,
3766 */ 3900 struct alloc_context *ac, gfp_t *alloc_mask,
3767struct page * 3901 unsigned int *alloc_flags)
3768__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
3769 struct zonelist *zonelist, nodemask_t *nodemask)
3770{ 3902{
3771 struct page *page; 3903 ac->high_zoneidx = gfp_zone(gfp_mask);
3772 unsigned int alloc_flags = ALLOC_WMARK_LOW; 3904 ac->zonelist = zonelist;
3773 gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ 3905 ac->nodemask = nodemask;
3774 struct alloc_context ac = { 3906 ac->migratetype = gfpflags_to_migratetype(gfp_mask);
3775 .high_zoneidx = gfp_zone(gfp_mask),
3776 .zonelist = zonelist,
3777 .nodemask = nodemask,
3778 .migratetype = gfpflags_to_migratetype(gfp_mask),
3779 };
3780 3907
3781 if (cpusets_enabled()) { 3908 if (cpusets_enabled()) {
3782 alloc_mask |= __GFP_HARDWALL; 3909 *alloc_mask |= __GFP_HARDWALL;
3783 alloc_flags |= ALLOC_CPUSET; 3910 if (!ac->nodemask)
3784 if (!ac.nodemask) 3911 ac->nodemask = &cpuset_current_mems_allowed;
3785 ac.nodemask = &cpuset_current_mems_allowed; 3912 else
3913 *alloc_flags |= ALLOC_CPUSET;
3786 } 3914 }
3787 3915
3788 gfp_mask &= gfp_allowed_mask;
3789
3790 lockdep_trace_alloc(gfp_mask); 3916 lockdep_trace_alloc(gfp_mask);
3791 3917
3792 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); 3918 might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
3793 3919
3794 if (should_fail_alloc_page(gfp_mask, order)) 3920 if (should_fail_alloc_page(gfp_mask, order))
3795 return NULL; 3921 return false;
3796 3922
3797 /* 3923 if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
3798 * Check the zones suitable for the gfp_mask contain at least one 3924 *alloc_flags |= ALLOC_CMA;
3799 * valid zone. It's possible to have an empty zonelist as a result
3800 * of __GFP_THISNODE and a memoryless node
3801 */
3802 if (unlikely(!zonelist->_zonerefs->zone))
3803 return NULL;
3804 3925
3805 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) 3926 return true;
3806 alloc_flags |= ALLOC_CMA; 3927}
3807 3928
3929/* Determine whether to spread dirty pages and what the first usable zone */
3930static inline void finalise_ac(gfp_t gfp_mask,
3931 unsigned int order, struct alloc_context *ac)
3932{
3808 /* Dirty zone balancing only done in the fast path */ 3933 /* Dirty zone balancing only done in the fast path */
3809 ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); 3934 ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
3810 3935
3811 /* 3936 /*
3812 * The preferred zone is used for statistics but crucially it is 3937 * The preferred zone is used for statistics but crucially it is
3813 * also used as the starting point for the zonelist iterator. It 3938 * also used as the starting point for the zonelist iterator. It
3814 * may get reset for allocations that ignore memory policies. 3939 * may get reset for allocations that ignore memory policies.
3815 */ 3940 */
3816 ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, 3941 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3817 ac.high_zoneidx, ac.nodemask); 3942 ac->high_zoneidx, ac->nodemask);
3818 if (!ac.preferred_zoneref->zone) { 3943}
3819 page = NULL; 3944
3820 /* 3945/*
3821 * This might be due to race with cpuset_current_mems_allowed 3946 * This is the 'heart' of the zoned buddy allocator.
3822 * update, so make sure we retry with original nodemask in the 3947 */
3823 * slow path. 3948struct page *
3824 */ 3949__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
3825 goto no_zone; 3950 struct zonelist *zonelist, nodemask_t *nodemask)
3826 } 3951{
3952 struct page *page;
3953 unsigned int alloc_flags = ALLOC_WMARK_LOW;
3954 gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
3955 struct alloc_context ac = { };
3956
3957 gfp_mask &= gfp_allowed_mask;
3958 if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags))
3959 return NULL;
3960
3961 finalise_ac(gfp_mask, order, &ac);
3827 3962
3828 /* First allocation attempt */ 3963 /* First allocation attempt */
3829 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); 3964 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
3830 if (likely(page)) 3965 if (likely(page))
3831 goto out; 3966 goto out;
3832 3967
3833no_zone:
3834 /* 3968 /*
3835 * Runtime PM, block IO and its error handling path can deadlock 3969 * Runtime PM, block IO and its error handling path can deadlock
3836 * because I/O on the device might not complete. 3970 * because I/O on the device might not complete.
@@ -4252,20 +4386,20 @@ void si_meminfo_node(struct sysinfo *val, int nid)
4252 * Determine whether the node should be displayed or not, depending on whether 4386 * Determine whether the node should be displayed or not, depending on whether
4253 * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). 4387 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
4254 */ 4388 */
4255bool skip_free_areas_node(unsigned int flags, int nid) 4389static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
4256{ 4390{
4257 bool ret = false;
4258 unsigned int cpuset_mems_cookie;
4259
4260 if (!(flags & SHOW_MEM_FILTER_NODES)) 4391 if (!(flags & SHOW_MEM_FILTER_NODES))
4261 goto out; 4392 return false;
4262 4393
4263 do { 4394 /*
4264 cpuset_mems_cookie = read_mems_allowed_begin(); 4395 * no node mask - aka implicit memory numa policy. Do not bother with
4265 ret = !node_isset(nid, cpuset_current_mems_allowed); 4396 * the synchronization - read_mems_allowed_begin - because we do not
4266 } while (read_mems_allowed_retry(cpuset_mems_cookie)); 4397 * have to be precise here.
4267out: 4398 */
4268 return ret; 4399 if (!nodemask)
4400 nodemask = &cpuset_current_mems_allowed;
4401
4402 return !node_isset(nid, *nodemask);
4269} 4403}
4270 4404
4271#define K(x) ((x) << (PAGE_SHIFT-10)) 4405#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -4306,7 +4440,7 @@ static void show_migration_types(unsigned char type)
4306 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's 4440 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
4307 * cpuset. 4441 * cpuset.
4308 */ 4442 */
4309void show_free_areas(unsigned int filter) 4443void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4310{ 4444{
4311 unsigned long free_pcp = 0; 4445 unsigned long free_pcp = 0;
4312 int cpu; 4446 int cpu;
@@ -4314,7 +4448,7 @@ void show_free_areas(unsigned int filter)
4314 pg_data_t *pgdat; 4448 pg_data_t *pgdat;
4315 4449
4316 for_each_populated_zone(zone) { 4450 for_each_populated_zone(zone) {
4317 if (skip_free_areas_node(filter, zone_to_nid(zone))) 4451 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4318 continue; 4452 continue;
4319 4453
4320 for_each_online_cpu(cpu) 4454 for_each_online_cpu(cpu)
@@ -4348,6 +4482,9 @@ void show_free_areas(unsigned int filter)
4348 global_page_state(NR_FREE_CMA_PAGES)); 4482 global_page_state(NR_FREE_CMA_PAGES));
4349 4483
4350 for_each_online_pgdat(pgdat) { 4484 for_each_online_pgdat(pgdat) {
4485 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
4486 continue;
4487
4351 printk("Node %d" 4488 printk("Node %d"
4352 " active_anon:%lukB" 4489 " active_anon:%lukB"
4353 " inactive_anon:%lukB" 4490 " inactive_anon:%lukB"
@@ -4397,7 +4534,7 @@ void show_free_areas(unsigned int filter)
4397 for_each_populated_zone(zone) { 4534 for_each_populated_zone(zone) {
4398 int i; 4535 int i;
4399 4536
4400 if (skip_free_areas_node(filter, zone_to_nid(zone))) 4537 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4401 continue; 4538 continue;
4402 4539
4403 free_pcp = 0; 4540 free_pcp = 0;
@@ -4462,7 +4599,7 @@ void show_free_areas(unsigned int filter)
4462 unsigned long nr[MAX_ORDER], flags, total = 0; 4599 unsigned long nr[MAX_ORDER], flags, total = 0;
4463 unsigned char types[MAX_ORDER]; 4600 unsigned char types[MAX_ORDER];
4464 4601
4465 if (skip_free_areas_node(filter, zone_to_nid(zone))) 4602 if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4466 continue; 4603 continue;
4467 show_node(zone); 4604 show_node(zone);
4468 printk(KERN_CONT "%s: ", zone->name); 4605 printk(KERN_CONT "%s: ", zone->name);
@@ -5083,8 +5220,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5083 if (context != MEMMAP_EARLY) 5220 if (context != MEMMAP_EARLY)
5084 goto not_early; 5221 goto not_early;
5085 5222
5086 if (!early_pfn_valid(pfn)) 5223 if (!early_pfn_valid(pfn)) {
5224#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5225 /*
5226 * Skip to the pfn preceding the next valid one (or
5227 * end_pfn), such that we hit a valid pfn (or end_pfn)
5228 * on our next iteration of the loop.
5229 */
5230 pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
5231#endif
5087 continue; 5232 continue;
5233 }
5088 if (!early_pfn_in_nid(pfn, nid)) 5234 if (!early_pfn_in_nid(pfn, nid))
5089 continue; 5235 continue;
5090 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) 5236 if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
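
The hunk above lets memmap_init_zone() leap over holes: when a pfn is invalid it jumps to one before the next valid pfn (as reported by memblock_next_valid_pfn()), so the enclosing for-loop's pfn++ lands exactly on the next valid frame or on end_pfn. A toy userspace sketch of that skip-ahead idiom follows, with an invented next_valid_pfn_demo() in place of the memblock helper.

/* Sketch of the skip-ahead idiom above; nothing here is kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define END_PFN 24UL

/* pretend pfns 8..15 sit in a hole in the memory map */
static bool pfn_valid_demo(unsigned long pfn)
{
	return pfn < 8 || pfn > 15;
}

static unsigned long next_valid_pfn_demo(unsigned long pfn, unsigned long end)
{
	while (pfn < end && !pfn_valid_demo(pfn))
		pfn++;
	return pfn;	/* first valid pfn at or after 'pfn', capped at end */
}

int main(void)
{
	unsigned long pfn;
	unsigned int iterations = 0;

	for (pfn = 0; pfn < END_PFN; pfn++) {
		iterations++;
		if (!pfn_valid_demo(pfn)) {
			/* land one short so the loop increment hits the valid pfn */
			pfn = next_valid_pfn_demo(pfn, END_PFN) - 1;
			continue;
		}
		printf("init pfn %lu\n", pfn);
	}
	printf("loop iterations: %u (instead of %lu)\n", iterations, END_PFN);
	return 0;
}
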
@@ -5780,7 +5926,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
5780 * the zone and SPARSEMEM is in use. If there are holes within the 5926 * the zone and SPARSEMEM is in use. If there are holes within the
5781 * zone, each populated memory region may cost us one or two extra 5927 * zone, each populated memory region may cost us one or two extra
5782 * memmap pages due to alignment because memmap pages for each 5928 * memmap pages due to alignment because memmap pages for each
5783 * populated regions may not naturally algined on page boundary. 5929 * populated regions may not be naturally aligned on page boundary.
5784 * So the (present_pages >> 4) heuristic is a tradeoff for that. 5930 * So the (present_pages >> 4) heuristic is a tradeoff for that.
5785 */ 5931 */
5786 if (spanned_pages > present_pages + (present_pages >> 4) && 5932 if (spanned_pages > present_pages + (present_pages >> 4) &&
@@ -6344,8 +6490,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6344 6490
6345 start_pfn = end_pfn; 6491 start_pfn = end_pfn;
6346 } 6492 }
6347 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
6348 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
6349 6493
6350 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 6494 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
6351 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 6495 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
@@ -7081,8 +7225,9 @@ void *__init alloc_large_system_hash(const char *tablename,
7081 * If @count is not zero, it is okay to include less @count unmovable pages 7225 * If @count is not zero, it is okay to include less @count unmovable pages
7082 * 7226 *
7083 * PageLRU check without isolation or lru_lock could race so that 7227 * PageLRU check without isolation or lru_lock could race so that
7084 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 7228 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
7085 * expect this function should be exact. 7229 * check without lock_page also may miss some movable non-lru pages at
7230 * race condition. So you can't expect this function should be exact.
7086 */ 7231 */
7087bool has_unmovable_pages(struct zone *zone, struct page *page, int count, 7232bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7088 bool skip_hwpoisoned_pages) 7233 bool skip_hwpoisoned_pages)
@@ -7138,6 +7283,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7138 if (skip_hwpoisoned_pages && PageHWPoison(page)) 7283 if (skip_hwpoisoned_pages && PageHWPoison(page))
7139 continue; 7284 continue;
7140 7285
7286 if (__PageMovable(page))
7287 continue;
7288
7141 if (!PageLRU(page)) 7289 if (!PageLRU(page))
7142 found++; 7290 found++;
7143 /* 7291 /*
@@ -7249,6 +7397,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
7249 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks 7397 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
7250 * in range must have the same migratetype and it must 7398 * in range must have the same migratetype and it must
7251 * be either of the two. 7399 * be either of the two.
7400 * @gfp_mask: GFP mask to use during compaction
7252 * 7401 *
7253 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 7402 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
7254 * aligned, however it's the caller's responsibility to guarantee that 7403 * aligned, however it's the caller's responsibility to guarantee that
@@ -7262,7 +7411,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
7262 * need to be freed with free_contig_range(). 7411 * need to be freed with free_contig_range().
7263 */ 7412 */
7264int alloc_contig_range(unsigned long start, unsigned long end, 7413int alloc_contig_range(unsigned long start, unsigned long end,
7265 unsigned migratetype) 7414 unsigned migratetype, gfp_t gfp_mask)
7266{ 7415{
7267 unsigned long outer_start, outer_end; 7416 unsigned long outer_start, outer_end;
7268 unsigned int order; 7417 unsigned int order;
@@ -7274,7 +7423,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
7274 .zone = page_zone(pfn_to_page(start)), 7423 .zone = page_zone(pfn_to_page(start)),
7275 .mode = MIGRATE_SYNC, 7424 .mode = MIGRATE_SYNC,
7276 .ignore_skip_hint = true, 7425 .ignore_skip_hint = true,
7277 .gfp_mask = GFP_KERNEL, 7426 .gfp_mask = memalloc_noio_flags(gfp_mask),
7278 }; 7427 };
7279 INIT_LIST_HEAD(&cc.migratepages); 7428 INIT_LIST_HEAD(&cc.migratepages);
7280 7429