Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  533
1 file changed, 417 insertions, 116 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 838ca8bb64f7..59de90d5d3a3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
223 | #endif | 223 | #endif |
224 | }; | 224 | }; |
225 | 225 | ||
226 | char * const migratetype_names[MIGRATE_TYPES] = { | ||
227 | "Unmovable", | ||
228 | "Movable", | ||
229 | "Reclaimable", | ||
230 | "HighAtomic", | ||
231 | #ifdef CONFIG_CMA | ||
232 | "CMA", | ||
233 | #endif | ||
234 | #ifdef CONFIG_MEMORY_ISOLATION | ||
235 | "Isolate", | ||
236 | #endif | ||
237 | }; | ||
238 | |||
226 | compound_page_dtor * const compound_page_dtors[] = { | 239 | compound_page_dtor * const compound_page_dtors[] = { |
227 | NULL, | 240 | NULL, |
228 | free_compound_page, | 241 | free_compound_page, |
@@ -236,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { | |||
236 | 249 | ||
237 | int min_free_kbytes = 1024; | 250 | int min_free_kbytes = 1024; |
238 | int user_min_free_kbytes = -1; | 251 | int user_min_free_kbytes = -1; |
252 | int watermark_scale_factor = 10; | ||
239 | 253 | ||
240 | static unsigned long __meminitdata nr_kernel_pages; | 254 | static unsigned long __meminitdata nr_kernel_pages; |
241 | static unsigned long __meminitdata nr_all_pages; | 255 | static unsigned long __meminitdata nr_all_pages; |
@@ -247,6 +261,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | |||
247 | static unsigned long __initdata required_kernelcore; | 261 | static unsigned long __initdata required_kernelcore; |
248 | static unsigned long __initdata required_movablecore; | 262 | static unsigned long __initdata required_movablecore; |
249 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 263 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
264 | static bool mirrored_kernelcore; | ||
250 | 265 | ||
251 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 266 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
252 | int movable_zone; | 267 | int movable_zone; |
@@ -293,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat, | |||
293 | unsigned long pfn, unsigned long zone_end, | 308 | unsigned long pfn, unsigned long zone_end, |
294 | unsigned long *nr_initialised) | 309 | unsigned long *nr_initialised) |
295 | { | 310 | { |
311 | unsigned long max_initialise; | ||
312 | |||
296 | /* Always populate low zones for address-constrained allocations */ | 313 |
297 | if (zone_end < pgdat_end_pfn(pgdat)) | 314 | if (zone_end < pgdat_end_pfn(pgdat)) |
298 | return true; | 315 | return true; |
316 | /* | ||
317 | * Initialise at least 2G of a node but also take into account that | ||
318 | * the two large system hashes can take up 1GB for 0.25TB/node. | ||
319 | */ | ||
320 | max_initialise = max(2UL << (30 - PAGE_SHIFT), | ||
321 | (pgdat->node_spanned_pages >> 8)); | ||
299 | 322 | ||
300 | /* Initialise at least 2G of the highest zone */ | ||
301 | (*nr_initialised)++; | 323 | (*nr_initialised)++; |
302 | if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && | 324 | if ((*nr_initialised > max_initialise) && |
303 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { | 325 | (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
304 | pgdat->first_deferred_pfn = pfn; | 326 | pgdat->first_deferred_pfn = pfn; |
305 | return false; | 327 | return false; |
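The new cutoff defers struct page initialisation once a node has covered max(2G, node_spanned_pages / 256) worth of pages and the current pfn sits on a section boundary. A minimal userspace model of that arithmetic follows; PAGE_SHIFT of 12 and 32768 pages per section are assumptions for a 4K-page, 128M-section configuration, not values taken from this patch.

#include <stdio.h>

/* Userspace model of the deferred-init cutoff (assumes 4K pages and
 * 128M sections, i.e. PAGES_PER_SECTION == 32768). */
#define PAGE_SHIFT         12
#define PAGES_PER_SECTION  (1UL << 15)

static int defer_init(unsigned long spanned_pages, unsigned long nr_initialised,
                      unsigned long pfn)
{
        /* At least 2G per node, plus 1/256th of the node to cover the two
         * large system hashes (roughly 1GB per 0.25TB of node memory). */
        unsigned long max_initialise = 2UL << (30 - PAGE_SHIFT);

        if (spanned_pages >> 8 > max_initialise)
                max_initialise = spanned_pages >> 8;

        /* Defer only once the threshold is crossed on a section boundary. */
        return nr_initialised > max_initialise &&
               (pfn & (PAGES_PER_SECTION - 1)) == 0;
}

int main(void)
{
        /* 1TB node: the threshold grows to spanned/256, i.e. 4G of pages. */
        unsigned long spanned = 1UL << (40 - PAGE_SHIFT);

        printf("defer at pfn 0x%lx: %d\n", spanned >> 8,
               defer_init(spanned, (spanned >> 8) + 1, spanned >> 8));
        return 0;
}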
@@ -416,7 +438,7 @@ static void bad_page(struct page *page, const char *reason, | |||
416 | goto out; | 438 | goto out; |
417 | } | 439 | } |
418 | if (nr_unshown) { | 440 | if (nr_unshown) { |
419 | printk(KERN_ALERT | 441 | pr_alert( |
420 | "BUG: Bad page state: %lu messages suppressed\n", | 442 | "BUG: Bad page state: %lu messages suppressed\n", |
421 | nr_unshown); | 443 | nr_unshown); |
422 | nr_unshown = 0; | 444 | nr_unshown = 0; |
@@ -426,9 +448,14 @@ static void bad_page(struct page *page, const char *reason, | |||
426 | if (nr_shown++ == 0) | 448 | if (nr_shown++ == 0) |
427 | resume = jiffies + 60 * HZ; | 449 | resume = jiffies + 60 * HZ; |
428 | 450 | ||
429 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 451 | pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", |
430 | current->comm, page_to_pfn(page)); | 452 | current->comm, page_to_pfn(page)); |
431 | dump_page_badflags(page, reason, bad_flags); | 453 | __dump_page(page, reason); |
454 | bad_flags &= page->flags; | ||
455 | if (bad_flags) | ||
456 | pr_alert("bad because of flags: %#lx(%pGp)\n", | ||
457 | bad_flags, &bad_flags); | ||
458 | dump_page_owner(page); | ||
432 | 459 | ||
433 | print_modules(); | 460 | print_modules(); |
434 | dump_stack(); | 461 | dump_stack(); |
@@ -477,7 +504,9 @@ void prep_compound_page(struct page *page, unsigned int order) | |||
477 | 504 | ||
478 | #ifdef CONFIG_DEBUG_PAGEALLOC | 505 | #ifdef CONFIG_DEBUG_PAGEALLOC |
479 | unsigned int _debug_guardpage_minorder; | 506 | unsigned int _debug_guardpage_minorder; |
480 | bool _debug_pagealloc_enabled __read_mostly; | 507 | bool _debug_pagealloc_enabled __read_mostly |
508 | = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); | ||
509 | EXPORT_SYMBOL(_debug_pagealloc_enabled); | ||
481 | bool _debug_guardpage_enabled __read_mostly; | 510 | bool _debug_guardpage_enabled __read_mostly; |
482 | 511 | ||
483 | static int __init early_debug_pagealloc(char *buf) | 512 | static int __init early_debug_pagealloc(char *buf) |
@@ -488,6 +517,9 @@ static int __init early_debug_pagealloc(char *buf) | |||
488 | if (strcmp(buf, "on") == 0) | 517 | if (strcmp(buf, "on") == 0) |
489 | _debug_pagealloc_enabled = true; | 518 | _debug_pagealloc_enabled = true; |
490 | 519 | ||
520 | if (strcmp(buf, "off") == 0) | ||
521 | _debug_pagealloc_enabled = false; | ||
522 | |||
491 | return 0; | 523 | return 0; |
492 | } | 524 | } |
493 | early_param("debug_pagealloc", early_debug_pagealloc); | 525 | early_param("debug_pagealloc", early_debug_pagealloc); |
@@ -519,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf) | |||
519 | unsigned long res; | 551 | unsigned long res; |
520 | 552 | ||
521 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { | 553 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { |
522 | printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); | 554 | pr_err("Bad debug_guardpage_minorder value\n"); |
523 | return 0; | 555 | return 0; |
524 | } | 556 | } |
525 | _debug_guardpage_minorder = res; | 557 | _debug_guardpage_minorder = res; |
526 | printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); | 558 | pr_info("Setting debug_guardpage_minorder to %lu\n", res); |
527 | return 0; | 559 | return 0; |
528 | } | 560 | } |
529 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | 561 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); |
@@ -660,34 +692,28 @@ static inline void __free_one_page(struct page *page, | |||
660 | unsigned long combined_idx; | 692 | unsigned long combined_idx; |
661 | unsigned long uninitialized_var(buddy_idx); | 693 | unsigned long uninitialized_var(buddy_idx); |
662 | struct page *buddy; | 694 | struct page *buddy; |
663 | unsigned int max_order = MAX_ORDER; | 695 | unsigned int max_order; |
696 | |||
697 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | ||
664 | 698 | ||
665 | VM_BUG_ON(!zone_is_initialized(zone)); | 699 | VM_BUG_ON(!zone_is_initialized(zone)); |
666 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | 700 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); |
667 | 701 | ||
668 | VM_BUG_ON(migratetype == -1); | 702 | VM_BUG_ON(migratetype == -1); |
669 | if (is_migrate_isolate(migratetype)) { | 703 | if (likely(!is_migrate_isolate(migratetype))) |
670 | /* | ||
671 | * We restrict max order of merging to prevent merge | ||
672 | * between freepages on isolate pageblock and normal | ||
673 | * pageblock. Without this, pageblock isolation | ||
674 | * could cause incorrect freepage accounting. | ||
675 | */ | ||
676 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | ||
677 | } else { | ||
678 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | 704 | __mod_zone_freepage_state(zone, 1 << order, migratetype); |
679 | } | ||
680 | 705 | ||
681 | page_idx = pfn & ((1 << max_order) - 1); | 706 | page_idx = pfn & ((1 << MAX_ORDER) - 1); |
682 | 707 | ||
683 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); | 708 | VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); |
684 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 709 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
685 | 710 | ||
711 | continue_merging: | ||
686 | while (order < max_order - 1) { | 712 | while (order < max_order - 1) { |
687 | buddy_idx = __find_buddy_index(page_idx, order); | 713 | buddy_idx = __find_buddy_index(page_idx, order); |
688 | buddy = page + (buddy_idx - page_idx); | 714 | buddy = page + (buddy_idx - page_idx); |
689 | if (!page_is_buddy(page, buddy, order)) | 715 | if (!page_is_buddy(page, buddy, order)) |
690 | break; | 716 | goto done_merging; |
691 | /* | 717 | /* |
692 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, | 718 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
693 | * merge with it and move up one order. | 719 | * merge with it and move up one order. |
@@ -704,6 +730,32 @@ static inline void __free_one_page(struct page *page, | |||
704 | page_idx = combined_idx; | 730 | page_idx = combined_idx; |
705 | order++; | 731 | order++; |
706 | } | 732 | } |
733 | if (max_order < MAX_ORDER) { | ||
734 | /* If we are here, it means order is >= pageblock_order. | ||
735 | * We want to prevent merge between freepages on isolate | ||
736 | * pageblock and normal pageblock. Without this, pageblock | ||
737 | * isolation could cause incorrect freepage or CMA accounting. | ||
738 | * | ||
739 | * We don't want to hit this code for the more frequent | ||
740 | * low-order merging. | ||
741 | */ | ||
742 | if (unlikely(has_isolate_pageblock(zone))) { | ||
743 | int buddy_mt; | ||
744 | |||
745 | buddy_idx = __find_buddy_index(page_idx, order); | ||
746 | buddy = page + (buddy_idx - page_idx); | ||
747 | buddy_mt = get_pageblock_migratetype(buddy); | ||
748 | |||
749 | if (migratetype != buddy_mt | ||
750 | && (is_migrate_isolate(migratetype) || | ||
751 | is_migrate_isolate(buddy_mt))) | ||
752 | goto done_merging; | ||
753 | } | ||
754 | max_order++; | ||
755 | goto continue_merging; | ||
756 | } | ||
757 | |||
758 | done_merging: | ||
707 | set_page_order(page, order); | 759 | set_page_order(page, order); |
708 | 760 | ||
709 | /* | 761 | /* |
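The reworked __free_one_page() now merges freely up to pageblock_order and only then, if isolation is active anywhere in the zone, compares pageblock migratetypes before merging across a pageblock boundary. The index arithmetic the loop relies on is sketched below; __find_buddy_index() is the XOR shown here in this kernel, while the surrounding program is purely an illustrative model.

#include <stdio.h>

/* Minimal model of the buddy-index arithmetic used by __free_one_page().
 * In this kernel __find_buddy_index(idx, order) is idx ^ (1 << order);
 * the merged (combined) index is the lower of the two buddies. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

int main(void)
{
        unsigned long page_idx = 12;    /* order-2 block at index 12 */
        unsigned int order = 2;

        unsigned long buddy_idx = find_buddy_index(page_idx, order);
        unsigned long combined_idx = buddy_idx & page_idx;

        /* 12 ^ 4 = 8, so the buddy is the block at 8 and the merged
         * order-3 block starts at index 8. */
        printf("buddy=%lu combined=%lu\n", buddy_idx, combined_idx);
        return 0;
}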
@@ -741,7 +793,7 @@ static inline int free_pages_check(struct page *page) | |||
741 | bad_reason = "nonzero mapcount"; | 793 | bad_reason = "nonzero mapcount"; |
742 | if (unlikely(page->mapping != NULL)) | 794 | if (unlikely(page->mapping != NULL)) |
743 | bad_reason = "non-NULL mapping"; | 795 | bad_reason = "non-NULL mapping"; |
744 | if (unlikely(atomic_read(&page->_count) != 0)) | 796 | if (unlikely(page_ref_count(page) != 0)) |
745 | bad_reason = "nonzero _count"; | 797 | bad_reason = "nonzero _count"; |
746 | if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { | 798 | if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { |
747 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; | 799 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
@@ -1002,6 +1054,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
1002 | PAGE_SIZE << order); | 1054 | PAGE_SIZE << order); |
1003 | } | 1055 | } |
1004 | arch_free_page(page, order); | 1056 | arch_free_page(page, order); |
1057 | kernel_poison_pages(page, 1 << order, 0); | ||
1005 | kernel_map_pages(page, 1 << order, 0); | 1058 | kernel_map_pages(page, 1 << order, 0); |
1006 | 1059 | ||
1007 | return true; | 1060 | return true; |
@@ -1104,6 +1157,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, | |||
1104 | return __free_pages_boot_core(page, pfn, order); | 1157 | return __free_pages_boot_core(page, pfn, order); |
1105 | } | 1158 | } |
1106 | 1159 | ||
1160 | /* | ||
1161 | * Check that the whole (or subset of) a pageblock given by the interval of | ||
1162 | * [start_pfn, end_pfn) is valid and within the same zone, before scanning it | ||
1163 | * with the migration or free compaction scanner. The scanners then need to | ||
1164 | * use only pfn_valid_within() check for arches that allow holes within | ||
1165 | * pageblocks. | ||
1166 | * | ||
1167 | * Return struct page pointer of start_pfn, or NULL if checks were not passed. | ||
1168 | * | ||
1169 | * It's possible on some configurations to have a setup like node0 node1 node0 | ||
1170 | * i.e. it's possible that all pages within a zone's range of pages do not | ||
1171 | * belong to a single zone. We assume that a border between node0 and node1 | ||
1172 | * can occur within a single pageblock, but not a node0 node1 node0 | ||
1173 | * interleaving within a single pageblock. It is therefore sufficient to check | ||
1174 | * the first and last page of a pageblock and avoid checking each individual | ||
1175 | * page in a pageblock. | ||
1176 | */ | ||
1177 | struct page *__pageblock_pfn_to_page(unsigned long start_pfn, | ||
1178 | unsigned long end_pfn, struct zone *zone) | ||
1179 | { | ||
1180 | struct page *start_page; | ||
1181 | struct page *end_page; | ||
1182 | |||
1183 | /* end_pfn is one past the range we are checking */ | ||
1184 | end_pfn--; | ||
1185 | |||
1186 | if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) | ||
1187 | return NULL; | ||
1188 | |||
1189 | start_page = pfn_to_page(start_pfn); | ||
1190 | |||
1191 | if (page_zone(start_page) != zone) | ||
1192 | return NULL; | ||
1193 | |||
1194 | end_page = pfn_to_page(end_pfn); | ||
1195 | |||
1196 | /* This gives a shorter code than deriving page_zone(end_page) */ | ||
1197 | if (page_zone_id(start_page) != page_zone_id(end_page)) | ||
1198 | return NULL; | ||
1199 | |||
1200 | return start_page; | ||
1201 | } | ||
1202 | |||
1203 | void set_zone_contiguous(struct zone *zone) | ||
1204 | { | ||
1205 | unsigned long block_start_pfn = zone->zone_start_pfn; | ||
1206 | unsigned long block_end_pfn; | ||
1207 | |||
1208 | block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); | ||
1209 | for (; block_start_pfn < zone_end_pfn(zone); | ||
1210 | block_start_pfn = block_end_pfn, | ||
1211 | block_end_pfn += pageblock_nr_pages) { | ||
1212 | |||
1213 | block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); | ||
1214 | |||
1215 | if (!__pageblock_pfn_to_page(block_start_pfn, | ||
1216 | block_end_pfn, zone)) | ||
1217 | return; | ||
1218 | } | ||
1219 | |||
1220 | /* We confirm that there is no hole */ | ||
1221 | zone->contiguous = true; | ||
1222 | } | ||
1223 | |||
1224 | void clear_zone_contiguous(struct zone *zone) | ||
1225 | { | ||
1226 | zone->contiguous = false; | ||
1227 | } | ||
1228 | |||
1107 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | 1229 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
1108 | static void __init deferred_free_range(struct page *page, | 1230 | static void __init deferred_free_range(struct page *page, |
1109 | unsigned long pfn, int nr_pages) | 1231 | unsigned long pfn, int nr_pages) |
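set_zone_contiguous() walks the zone one pageblock at a time, clamping the first and last block to the zone boundaries, and marks the zone contiguous only if every block passes __pageblock_pfn_to_page(). A rough userspace model of that walk follows; a pageblock size of 512 pages (2M pageblocks with 4K pages) is an assumption.

#include <stdio.h>

/* Userspace model of the pageblock walk in set_zone_contiguous().
 * PAGEBLOCK_NR_PAGES == 512 is an assumption (2M pageblocks, 4K pages). */
#define PAGEBLOCK_NR_PAGES 512UL
#define ALIGN_UP(x, a)     (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long zone_start_pfn = 0x100, zone_end_pfn = 0x1000;
        unsigned long block_start = zone_start_pfn;
        unsigned long block_end = ALIGN_UP(block_start + 1, PAGEBLOCK_NR_PAGES);

        for (; block_start < zone_end_pfn;
             block_start = block_end, block_end += PAGEBLOCK_NR_PAGES) {
                if (block_end > zone_end_pfn)
                        block_end = zone_end_pfn;
                /* The kernel calls __pageblock_pfn_to_page() on each range and
                 * bails out on the first block whose ends straddle a zone or
                 * fall into a hole; only then is zone->contiguous set. */
                printf("check [0x%lx, 0x%lx)\n", block_start, block_end);
        }
        return 0;
}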
@@ -1254,9 +1376,13 @@ free_range: | |||
1254 | pgdat_init_report_one_done(); | 1376 | pgdat_init_report_one_done(); |
1255 | return 0; | 1377 | return 0; |
1256 | } | 1378 | } |
1379 | #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ | ||
1257 | 1380 | ||
1258 | void __init page_alloc_init_late(void) | 1381 | void __init page_alloc_init_late(void) |
1259 | { | 1382 | { |
1383 | struct zone *zone; | ||
1384 | |||
1385 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | ||
1260 | int nid; | 1386 | int nid; |
1261 | 1387 | ||
1262 | /* There will be num_node_state(N_MEMORY) threads */ | 1388 | /* There will be num_node_state(N_MEMORY) threads */ |
@@ -1270,8 +1396,11 @@ void __init page_alloc_init_late(void) | |||
1270 | 1396 | ||
1271 | /* Reinit limits that are based on free pages after the kernel is up */ | 1397 | /* Reinit limits that are based on free pages after the kernel is up */ |
1272 | files_maxfiles_init(); | 1398 | files_maxfiles_init(); |
1399 | #endif | ||
1400 | |||
1401 | for_each_populated_zone(zone) | ||
1402 | set_zone_contiguous(zone); | ||
1273 | } | 1403 | } |
1274 | #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ | ||
1275 | 1404 | ||
1276 | #ifdef CONFIG_CMA | 1405 | #ifdef CONFIG_CMA |
1277 | /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ | 1406 | /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ |
@@ -1360,7 +1489,7 @@ static inline int check_new_page(struct page *page) | |||
1360 | bad_reason = "nonzero mapcount"; | 1489 | bad_reason = "nonzero mapcount"; |
1361 | if (unlikely(page->mapping != NULL)) | 1490 | if (unlikely(page->mapping != NULL)) |
1362 | bad_reason = "non-NULL mapping"; | 1491 | bad_reason = "non-NULL mapping"; |
1363 | if (unlikely(atomic_read(&page->_count) != 0)) | 1492 | if (unlikely(page_ref_count(page) != 0)) |
1364 | bad_reason = "nonzero _count"; | 1493 | bad_reason = "nonzero _count"; |
1365 | if (unlikely(page->flags & __PG_HWPOISON)) { | 1494 | if (unlikely(page->flags & __PG_HWPOISON)) { |
1366 | bad_reason = "HWPoisoned (hardware-corrupted)"; | 1495 | bad_reason = "HWPoisoned (hardware-corrupted)"; |
@@ -1381,15 +1510,24 @@ static inline int check_new_page(struct page *page) | |||
1381 | return 0; | 1510 | return 0; |
1382 | } | 1511 | } |
1383 | 1512 | ||
1513 | static inline bool free_pages_prezeroed(bool poisoned) | ||
1514 | { | ||
1515 | return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && | ||
1516 | page_poisoning_enabled() && poisoned; | ||
1517 | } | ||
1518 | |||
1384 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, | 1519 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
1385 | int alloc_flags) | 1520 | int alloc_flags) |
1386 | { | 1521 | { |
1387 | int i; | 1522 | int i; |
1523 | bool poisoned = true; | ||
1388 | 1524 | ||
1389 | for (i = 0; i < (1 << order); i++) { | 1525 | for (i = 0; i < (1 << order); i++) { |
1390 | struct page *p = page + i; | 1526 | struct page *p = page + i; |
1391 | if (unlikely(check_new_page(p))) | 1527 | if (unlikely(check_new_page(p))) |
1392 | return 1; | 1528 | return 1; |
1529 | if (poisoned) | ||
1530 | poisoned &= page_is_poisoned(p); | ||
1393 | } | 1531 | } |
1394 | 1532 | ||
1395 | set_page_private(page, 0); | 1533 | set_page_private(page, 0); |
@@ -1397,9 +1535,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, | |||
1397 | 1535 | ||
1398 | arch_alloc_page(page, order); | 1536 | arch_alloc_page(page, order); |
1399 | kernel_map_pages(page, 1 << order, 1); | 1537 | kernel_map_pages(page, 1 << order, 1); |
1538 | kernel_poison_pages(page, 1 << order, 1); | ||
1400 | kasan_alloc_pages(page, order); | 1539 | kasan_alloc_pages(page, order); |
1401 | 1540 | ||
1402 | if (gfp_flags & __GFP_ZERO) | 1541 | if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) |
1403 | for (i = 0; i < (1 << order); i++) | 1542 | for (i = 0; i < (1 << order); i++) |
1404 | clear_highpage(page + i); | 1543 | clear_highpage(page + i); |
1405 | 1544 | ||
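With CONFIG_PAGE_POISONING_ZERO, pages are poisoned with zeroes on free, so prep_new_page() can skip clear_highpage() for a __GFP_ZERO allocation whose pages are all still poisoned. A small model of that decision, with the three conditions reduced to plain booleans:

#include <stdbool.h>
#include <stdio.h>

/* Model of the prep_new_page() zeroing decision. The three inputs stand in
 * for CONFIG_PAGE_POISONING_ZERO, page_poisoning_enabled() and the AND of
 * page_is_poisoned() over every page in the block. */
static bool free_pages_prezeroed(bool zero_poison_configured,
                                 bool poisoning_enabled, bool all_poisoned)
{
        return zero_poison_configured && poisoning_enabled && all_poisoned;
}

static bool must_clear(bool gfp_zero, bool prezeroed)
{
        /* clear_highpage() runs only when the caller asked for zeroed
         * memory and the pages are not already known to be zero. */
        return gfp_zero && !prezeroed;
}

int main(void)
{
        printf("zero-poisoned block, __GFP_ZERO: clear? %d\n",
               must_clear(true, free_pages_prezeroed(true, true, true)));
        printf("plain block, __GFP_ZERO: clear? %d\n",
               must_clear(true, free_pages_prezeroed(true, true, false)));
        return 0;
}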
@@ -2238,19 +2377,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
2238 | list_del(&page->lru); | 2377 | list_del(&page->lru); |
2239 | pcp->count--; | 2378 | pcp->count--; |
2240 | } else { | 2379 | } else { |
2241 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | 2380 | /* |
2242 | /* | 2381 | * We most definitely don't want callers attempting to |
2243 | * __GFP_NOFAIL is not to be used in new code. | 2382 | * allocate greater than order-1 page units with __GFP_NOFAIL. |
2244 | * | 2383 | */ |
2245 | * All __GFP_NOFAIL callers should be fixed so that they | 2384 | WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); |
2246 | * properly detect and handle allocation failures. | ||
2247 | * | ||
2248 | * We most definitely don't want callers attempting to | ||
2249 | * allocate greater than order-1 page units with | ||
2250 | * __GFP_NOFAIL. | ||
2251 | */ | ||
2252 | WARN_ON_ONCE(order > 1); | ||
2253 | } | ||
2254 | spin_lock_irqsave(&zone->lock, flags); | 2385 | spin_lock_irqsave(&zone->lock, flags); |
2255 | 2386 | ||
2256 | page = NULL; | 2387 | page = NULL; |
@@ -2690,9 +2821,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) | |||
2690 | va_end(args); | 2821 | va_end(args); |
2691 | } | 2822 | } |
2692 | 2823 | ||
2693 | pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", | 2824 | pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", |
2694 | current->comm, order, gfp_mask); | 2825 | current->comm, order, gfp_mask, &gfp_mask); |
2695 | |||
2696 | dump_stack(); | 2826 | dump_stack(); |
2697 | if (!should_suppress_show_mem()) | 2827 | if (!should_suppress_show_mem()) |
2698 | show_mem(filter); | 2828 | show_mem(filter); |
@@ -2748,8 +2878,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2748 | * XXX: Page reclaim didn't yield anything, | 2878 | * XXX: Page reclaim didn't yield anything, |
2749 | * and the OOM killer can't be invoked, but | 2879 | * and the OOM killer can't be invoked, but |
2750 | * keep looping as per tradition. | 2880 | * keep looping as per tradition. |
2881 | * | ||
2882 | * But do not keep looping if oom_killer_disable() | ||
2883 | * was already called, for the system is trying to | ||
2884 | * enter a quiescent state during suspend. | ||
2751 | */ | 2885 | */ |
2752 | *did_some_progress = 1; | 2886 | *did_some_progress = !oom_killer_disabled; |
2753 | goto out; | 2887 | goto out; |
2754 | } | 2888 | } |
2755 | if (pm_suspended_storage()) | 2889 | if (pm_suspended_storage()) |
@@ -3008,14 +3142,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
3008 | (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) | 3142 | (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) |
3009 | gfp_mask &= ~__GFP_ATOMIC; | 3143 | gfp_mask &= ~__GFP_ATOMIC; |
3010 | 3144 | ||
3011 | /* | ||
3012 | * If this allocation cannot block and it is for a specific node, then | ||
3013 | * fail early. There's no need to wakeup kswapd or retry for a | ||
3014 | * speculative node-specific allocation. | ||
3015 | */ | ||
3016 | if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) | ||
3017 | goto nopage; | ||
3018 | |||
3019 | retry: | 3145 | retry: |
3020 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) | 3146 | if (gfp_mask & __GFP_KSWAPD_RECLAIM) |
3021 | wake_all_kswapds(order, ac); | 3147 | wake_all_kswapds(order, ac); |
@@ -3372,7 +3498,7 @@ refill: | |||
3372 | /* Even if we own the page, we do not use atomic_set(). | 3498 | /* Even if we own the page, we do not use atomic_set(). |
3373 | * This would break get_page_unless_zero() users. | 3499 | * This would break get_page_unless_zero() users. |
3374 | */ | 3500 | */ |
3375 | atomic_add(size - 1, &page->_count); | 3501 | page_ref_add(page, size - 1); |
3376 | 3502 | ||
3377 | /* reset page count bias and offset to start of new frag */ | 3503 | /* reset page count bias and offset to start of new frag */ |
3378 | nc->pfmemalloc = page_is_pfmemalloc(page); | 3504 | nc->pfmemalloc = page_is_pfmemalloc(page); |
@@ -3384,7 +3510,7 @@ refill: | |||
3384 | if (unlikely(offset < 0)) { | 3510 | if (unlikely(offset < 0)) { |
3385 | page = virt_to_page(nc->va); | 3511 | page = virt_to_page(nc->va); |
3386 | 3512 | ||
3387 | if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) | 3513 | if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) |
3388 | goto refill; | 3514 | goto refill; |
3389 | 3515 | ||
3390 | #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) | 3516 | #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) |
@@ -3392,7 +3518,7 @@ refill: | |||
3392 | size = nc->size; | 3518 | size = nc->size; |
3393 | #endif | 3519 | #endif |
3394 | /* OK, page count is 0, we can safely set it */ | 3520 | /* OK, page count is 0, we can safely set it */ |
3395 | atomic_set(&page->_count, size); | 3521 | set_page_count(page, size); |
3396 | 3522 | ||
3397 | /* reset page count bias and offset to start of new frag */ | 3523 | /* reset page count bias and offset to start of new frag */ |
3398 | nc->pagecnt_bias = size; | 3524 | nc->pagecnt_bias = size; |
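The direct atomic_*() accesses to page->_count are replaced by page_ref_*() helpers so the refcount is read and modified in one place. A userspace sketch of what such thin wrappers look like, modeled over a C11 atomic rather than struct page; the real helpers are defined in include/linux/page_ref.h and this is only an approximation of their behaviour.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for struct page: only the reference count is modeled. */
struct fake_page { atomic_int count; };

static int page_ref_count(struct fake_page *p)
{
        return atomic_load(&p->count);
}

static void set_page_count(struct fake_page *p, int v)
{
        atomic_store(&p->count, v);
}

static void page_ref_add(struct fake_page *p, int nr)
{
        atomic_fetch_add(&p->count, nr);
}

static bool page_ref_sub_and_test(struct fake_page *p, int nr)
{
        /* true when the count reaches zero, mirroring atomic_sub_and_test() */
        return atomic_fetch_sub(&p->count, nr) == nr;
}

int main(void)
{
        struct fake_page page = { 0 };

        set_page_count(&page, 1);
        page_ref_add(&page, 3);                 /* e.g. a frag-cache bias */
        printf("count=%d\n", page_ref_count(&page));
        printf("dropped to zero? %d\n", page_ref_sub_and_test(&page, 4));
        return 0;
}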
@@ -3603,6 +3729,49 @@ static inline void show_node(struct zone *zone) | |||
3603 | printk("Node %d ", zone_to_nid(zone)); | 3729 | printk("Node %d ", zone_to_nid(zone)); |
3604 | } | 3730 | } |
3605 | 3731 | ||
3732 | long si_mem_available(void) | ||
3733 | { | ||
3734 | long available; | ||
3735 | unsigned long pagecache; | ||
3736 | unsigned long wmark_low = 0; | ||
3737 | unsigned long pages[NR_LRU_LISTS]; | ||
3738 | struct zone *zone; | ||
3739 | int lru; | ||
3740 | |||
3741 | for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) | ||
3742 | pages[lru] = global_page_state(NR_LRU_BASE + lru); | ||
3743 | |||
3744 | for_each_zone(zone) | ||
3745 | wmark_low += zone->watermark[WMARK_LOW]; | ||
3746 | |||
3747 | /* | ||
3748 | * Estimate the amount of memory available for userspace allocations, | ||
3749 | * without causing swapping. | ||
3750 | */ | ||
3751 | available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; | ||
3752 | |||
3753 | /* | ||
3754 | * Not all the page cache can be freed, otherwise the system will | ||
3755 | * start swapping. Assume at least half of the page cache, or the | ||
3756 | * low watermark worth of cache, needs to stay. | ||
3757 | */ | ||
3758 | pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; | ||
3759 | pagecache -= min(pagecache / 2, wmark_low); | ||
3760 | available += pagecache; | ||
3761 | |||
3762 | /* | ||
3763 | * Part of the reclaimable slab consists of items that are in use, | ||
3764 | * and cannot be freed. Cap this estimate at the low watermark. | ||
3765 | */ | ||
3766 | available += global_page_state(NR_SLAB_RECLAIMABLE) - | ||
3767 | min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); | ||
3768 | |||
3769 | if (available < 0) | ||
3770 | available = 0; | ||
3771 | return available; | ||
3772 | } | ||
3773 | EXPORT_SYMBOL_GPL(si_mem_available); | ||
3774 | |||
3606 | void si_meminfo(struct sysinfo *val) | 3775 | void si_meminfo(struct sysinfo *val) |
3607 | { | 3776 | { |
3608 | val->totalram = totalram_pages; | 3777 | val->totalram = totalram_pages; |
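si_mem_available() produces the same kind of estimate as /proc/meminfo's MemAvailable: free pages minus reserves, plus the share of the file LRU and reclaimable slab that could be dropped without pushing the system into swap. The arithmetic, restated in userspace with purely illustrative page counts:

#include <stdio.h>

static long min_l(long a, long b)
{
        return a < b ? a : b;
}

/* The MemAvailable-style estimate computed by si_mem_available(); all
 * quantities are in pages and the numbers below are made up. */
int main(void)
{
        long free_pages   = 200000;   /* NR_FREE_PAGES            */
        long totalreserve = 20000;    /* totalreserve_pages       */
        long file_lru     = 500000;   /* active + inactive file   */
        long slab_reclaim = 80000;    /* NR_SLAB_RECLAIMABLE      */
        long wmark_low    = 30000;    /* sum of zone low marks    */

        long available = free_pages - totalreserve;

        /* At least half the page cache, or the low watermark worth of it,
         * is assumed to stay resident. */
        available += file_lru - min_l(file_lru / 2, wmark_low);

        /* Same cap for reclaimable slab: only part of it can be freed. */
        available += slab_reclaim - min_l(slab_reclaim / 2, wmark_low);

        if (available < 0)
                available = 0;
        printf("estimated available: %ld pages\n", available);
        return 0;
}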
@@ -3935,9 +4104,7 @@ static int __parse_numa_zonelist_order(char *s) | |||
3935 | } else if (*s == 'z' || *s == 'Z') { | 4104 | } else if (*s == 'z' || *s == 'Z') { |
3936 | user_zonelist_order = ZONELIST_ORDER_ZONE; | 4105 | user_zonelist_order = ZONELIST_ORDER_ZONE; |
3937 | } else { | 4106 | } else { |
3938 | printk(KERN_WARNING | 4107 | pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); |
3939 | "Ignoring invalid numa_zonelist_order value: " | ||
3940 | "%s\n", s); | ||
3941 | return -EINVAL; | 4108 | return -EINVAL; |
3942 | } | 4109 | } |
3943 | return 0; | 4110 | return 0; |
@@ -4401,12 +4568,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | |||
4401 | else | 4568 | else |
4402 | page_group_by_mobility_disabled = 0; | 4569 | page_group_by_mobility_disabled = 0; |
4403 | 4570 | ||
4404 | pr_info("Built %i zonelists in %s order, mobility grouping %s. " | 4571 | pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", |
4405 | "Total pages: %ld\n", | 4572 | nr_online_nodes, |
4406 | nr_online_nodes, | 4573 | zonelist_order_name[current_zonelist_order], |
4407 | zonelist_order_name[current_zonelist_order], | 4574 | page_group_by_mobility_disabled ? "off" : "on", |
4408 | page_group_by_mobility_disabled ? "off" : "on", | 4575 | vm_total_pages); |
4409 | vm_total_pages); | ||
4410 | #ifdef CONFIG_NUMA | 4576 | #ifdef CONFIG_NUMA |
4411 | pr_info("Policy zone: %s\n", zone_names[policy_zone]); | 4577 | pr_info("Policy zone: %s\n", zone_names[policy_zone]); |
4412 | #endif | 4578 | #endif |
@@ -4491,6 +4657,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4491 | pg_data_t *pgdat = NODE_DATA(nid); | 4657 | pg_data_t *pgdat = NODE_DATA(nid); |
4492 | unsigned long pfn; | 4658 | unsigned long pfn; |
4493 | unsigned long nr_initialised = 0; | 4659 | unsigned long nr_initialised = 0; |
4660 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
4661 | struct memblock_region *r = NULL, *tmp; | ||
4662 | #endif | ||
4494 | 4663 | ||
4495 | if (highest_memmap_pfn < end_pfn - 1) | 4664 | if (highest_memmap_pfn < end_pfn - 1) |
4496 | highest_memmap_pfn = end_pfn - 1; | 4665 | highest_memmap_pfn = end_pfn - 1; |
@@ -4504,20 +4673,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
4504 | 4673 | ||
4505 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 4674 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
4506 | /* | 4675 | /* |
4507 | * There can be holes in boot-time mem_map[]s | 4676 | * There can be holes in boot-time mem_map[]s handed to this |
4508 | * handed to this function. They do not | 4677 | * function. They do not exist on hotplugged memory. |
4509 | * exist on hotplugged memory. | 4678 | */ |
4679 | if (context != MEMMAP_EARLY) | ||
4680 | goto not_early; | ||
4681 | |||
4682 | if (!early_pfn_valid(pfn)) | ||
4683 | continue; | ||
4684 | if (!early_pfn_in_nid(pfn, nid)) | ||
4685 | continue; | ||
4686 | if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) | ||
4687 | break; | ||
4688 | |||
4689 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | ||
4690 | /* | ||
4691 | * If not mirrored_kernelcore and ZONE_MOVABLE exists, range | ||
4692 | * from zone_movable_pfn[nid] to end of each node should be | ||
4693 | * ZONE_MOVABLE not ZONE_NORMAL. skip it. | ||
4510 | */ | 4694 | */ |
4511 | if (context == MEMMAP_EARLY) { | 4695 | if (!mirrored_kernelcore && zone_movable_pfn[nid]) |
4512 | if (!early_pfn_valid(pfn)) | 4696 | if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) |
4513 | continue; | 4697 | continue; |
4514 | if (!early_pfn_in_nid(pfn, nid)) | 4698 | |
4699 | /* | ||
4700 | * Check given memblock attribute by firmware which can affect | ||
4701 | * kernel memory layout. If zone==ZONE_MOVABLE but memory is | ||
4702 | * mirrored, it's an overlapped memmap init. skip it. | ||
4703 | */ | ||
4704 | if (mirrored_kernelcore && zone == ZONE_MOVABLE) { | ||
4705 | if (!r || pfn >= memblock_region_memory_end_pfn(r)) { | ||
4706 | for_each_memblock(memory, tmp) | ||
4707 | if (pfn < memblock_region_memory_end_pfn(tmp)) | ||
4708 | break; | ||
4709 | r = tmp; | ||
4710 | } | ||
4711 | if (pfn >= memblock_region_memory_base_pfn(r) && | ||
4712 | memblock_is_mirror(r)) { | ||
4713 | /* already initialized as NORMAL */ | ||
4714 | pfn = memblock_region_memory_end_pfn(r); | ||
4515 | continue; | 4715 | continue; |
4516 | if (!update_defer_init(pgdat, pfn, end_pfn, | 4716 | } |
4517 | &nr_initialised)) | ||
4518 | break; | ||
4519 | } | 4717 | } |
4718 | #endif | ||
4520 | 4719 | ||
4720 | not_early: | ||
4521 | /* | 4721 | /* |
4522 | * Mark the block movable so that blocks are reserved for | 4722 | * Mark the block movable so that blocks are reserved for |
4523 | * movable at startup. This will force kernel allocations | 4723 | * movable at startup. This will force kernel allocations |
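When kernelcore=mirror is in effect, the mirrored ranges were already initialised as ZONE_NORMAL, so the ZONE_MOVABLE pass of memmap_init_zone() jumps the pfn straight to the end of each mirrored memblock region. A simplified model of that skip; the region table below is a hypothetical stand-in for the memblock memory list.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a memblock memory region; "mirror" models memblock_is_mirror(). */
struct region {
        unsigned long base_pfn;
        unsigned long end_pfn;
        bool mirror;
};

int main(void)
{
        /* Hypothetical layout: one mirrored range in the middle of the zone. */
        const struct region regions[] = {
                { 0x0000, 0x8000,  false },
                { 0x8000, 0xc000,  true  },
                { 0xc000, 0x10000, false },
        };
        const unsigned int nr = sizeof(regions) / sizeof(regions[0]);

        for (unsigned long pfn = 0; pfn < 0x10000; pfn++) {
                /* Find the region covering this pfn, as the memblock walk does. */
                const struct region *r = NULL;

                for (unsigned int i = 0; i < nr; i++)
                        if (pfn < regions[i].end_pfn) {
                                r = &regions[i];
                                break;
                        }

                /* Mirrored memory was already set up as ZONE_NORMAL: skip the
                 * whole region instead of touching each pfn. */
                if (r && r->mirror && pfn >= r->base_pfn) {
                        printf("skip mirrored [0x%lx, 0x%lx)\n",
                               r->base_pfn, r->end_pfn);
                        pfn = r->end_pfn - 1;   /* loop increment resumes after it */
                        continue;
                }
                /* ...initialise pfn as ZONE_MOVABLE here... */
        }
        return 0;
}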
@@ -4934,11 +5134,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, | |||
4934 | *zone_end_pfn = min(node_end_pfn, | 5134 | *zone_end_pfn = min(node_end_pfn, |
4935 | arch_zone_highest_possible_pfn[movable_zone]); | 5135 | arch_zone_highest_possible_pfn[movable_zone]); |
4936 | 5136 | ||
4937 | /* Adjust for ZONE_MOVABLE starting within this range */ | ||
4938 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && | ||
4939 | *zone_end_pfn > zone_movable_pfn[nid]) { | ||
4940 | *zone_end_pfn = zone_movable_pfn[nid]; | ||
4941 | |||
4942 | /* Check if this whole range is within ZONE_MOVABLE */ | 5137 | /* Check if this whole range is within ZONE_MOVABLE */ |
4943 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) | 5138 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) |
4944 | *zone_start_pfn = *zone_end_pfn; | 5139 | *zone_start_pfn = *zone_end_pfn; |
@@ -4953,31 +5148,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
4953 | unsigned long zone_type, | 5148 | unsigned long zone_type, |
4954 | unsigned long node_start_pfn, | 5149 | unsigned long node_start_pfn, |
4955 | unsigned long node_end_pfn, | 5150 | unsigned long node_end_pfn, |
5151 | unsigned long *zone_start_pfn, | ||
5152 | unsigned long *zone_end_pfn, | ||
4956 | unsigned long *ignored) | 5153 | unsigned long *ignored) |
4957 | { | 5154 | { |
4958 | unsigned long zone_start_pfn, zone_end_pfn; | ||
4959 | |||
4960 | /* When hotadd a new node from cpu_up(), the node should be empty */ | 5155 | /* When hotadd a new node from cpu_up(), the node should be empty */ |
4961 | if (!node_start_pfn && !node_end_pfn) | 5156 | if (!node_start_pfn && !node_end_pfn) |
4962 | return 0; | 5157 | return 0; |
4963 | 5158 | ||
4964 | /* Get the start and end of the zone */ | 5159 | /* Get the start and end of the zone */ |
4965 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | 5160 | *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; |
4966 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | 5161 | *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; |
4967 | adjust_zone_range_for_zone_movable(nid, zone_type, | 5162 | adjust_zone_range_for_zone_movable(nid, zone_type, |
4968 | node_start_pfn, node_end_pfn, | 5163 | node_start_pfn, node_end_pfn, |
4969 | &zone_start_pfn, &zone_end_pfn); | 5164 | zone_start_pfn, zone_end_pfn); |
4970 | 5165 | ||
4971 | /* Check that this node has pages within the zone's required range */ | 5166 | /* Check that this node has pages within the zone's required range */ |
4972 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | 5167 | if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) |
4973 | return 0; | 5168 | return 0; |
4974 | 5169 | ||
4975 | /* Move the zone boundaries inside the node if necessary */ | 5170 | /* Move the zone boundaries inside the node if necessary */ |
4976 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | 5171 | *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); |
4977 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | 5172 | *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); |
4978 | 5173 | ||
4979 | /* Return the spanned pages */ | 5174 | /* Return the spanned pages */ |
4980 | return zone_end_pfn - zone_start_pfn; | 5175 | return *zone_end_pfn - *zone_start_pfn; |
4981 | } | 5176 | } |
4982 | 5177 | ||
4983 | /* | 5178 | /* |
@@ -5023,6 +5218,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
5023 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; | 5218 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
5024 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; | 5219 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
5025 | unsigned long zone_start_pfn, zone_end_pfn; | 5220 | unsigned long zone_start_pfn, zone_end_pfn; |
5221 | unsigned long nr_absent; | ||
5026 | 5222 | ||
5027 | /* When hotadd a new node from cpu_up(), the node should be empty */ | 5223 | /* When hotadd a new node from cpu_up(), the node should be empty */ |
5028 | if (!node_start_pfn && !node_end_pfn) | 5224 | if (!node_start_pfn && !node_end_pfn) |
@@ -5034,7 +5230,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
5034 | adjust_zone_range_for_zone_movable(nid, zone_type, | 5230 | adjust_zone_range_for_zone_movable(nid, zone_type, |
5035 | node_start_pfn, node_end_pfn, | 5231 | node_start_pfn, node_end_pfn, |
5036 | &zone_start_pfn, &zone_end_pfn); | 5232 | &zone_start_pfn, &zone_end_pfn); |
5037 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 5233 | nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
5234 | |||
5235 | /* | ||
5236 | * ZONE_MOVABLE handling. | ||
5237 | * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages | ||
5238 | * and vice versa. | ||
5239 | */ | ||
5240 | if (zone_movable_pfn[nid]) { | ||
5241 | if (mirrored_kernelcore) { | ||
5242 | unsigned long start_pfn, end_pfn; | ||
5243 | struct memblock_region *r; | ||
5244 | |||
5245 | for_each_memblock(memory, r) { | ||
5246 | start_pfn = clamp(memblock_region_memory_base_pfn(r), | ||
5247 | zone_start_pfn, zone_end_pfn); | ||
5248 | end_pfn = clamp(memblock_region_memory_end_pfn(r), | ||
5249 | zone_start_pfn, zone_end_pfn); | ||
5250 | |||
5251 | if (zone_type == ZONE_MOVABLE && | ||
5252 | memblock_is_mirror(r)) | ||
5253 | nr_absent += end_pfn - start_pfn; | ||
5254 | |||
5255 | if (zone_type == ZONE_NORMAL && | ||
5256 | !memblock_is_mirror(r)) | ||
5257 | nr_absent += end_pfn - start_pfn; | ||
5258 | } | ||
5259 | } else { | ||
5260 | if (zone_type == ZONE_NORMAL) | ||
5261 | nr_absent += node_end_pfn - zone_movable_pfn[nid]; | ||
5262 | } | ||
5263 | } | ||
5264 | |||
5265 | return nr_absent; | ||
5038 | } | 5266 | } |
5039 | 5267 | ||
5040 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5268 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
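The new ZONE_MOVABLE/ZONE_NORMAL absent-page accounting clamps every memblock region to the zone's pfn range and counts the overlap as absent for whichever zone the pages do not belong to. The overlap arithmetic, with clamp() reimplemented to match the kernel macro's bound-to-[lo, hi] behaviour and purely illustrative ranges:

#include <stdio.h>

static unsigned long clamp_pfn(unsigned long v, unsigned long lo, unsigned long hi)
{
        /* Same behaviour as the kernel's clamp(): bound v to [lo, hi]. */
        if (v < lo)
                return lo;
        if (v > hi)
                return hi;
        return v;
}

int main(void)
{
        /* Illustrative zone and mirrored-region ranges, in pfns. */
        unsigned long zone_start = 0x1000, zone_end = 0x9000;
        unsigned long region_base = 0x0800, region_end = 0x3000;

        unsigned long start = clamp_pfn(region_base, zone_start, zone_end);
        unsigned long end   = clamp_pfn(region_end,  zone_start, zone_end);

        /* For ZONE_MOVABLE, mirrored pages in this window count as absent;
         * for ZONE_NORMAL, the non-mirrored ones do. */
        printf("overlap with zone: %lu pages\n", end - start);
        return 0;
}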
@@ -5042,8 +5270,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
5042 | unsigned long zone_type, | 5270 | unsigned long zone_type, |
5043 | unsigned long node_start_pfn, | 5271 | unsigned long node_start_pfn, |
5044 | unsigned long node_end_pfn, | 5272 | unsigned long node_end_pfn, |
5273 | unsigned long *zone_start_pfn, | ||
5274 | unsigned long *zone_end_pfn, | ||
5045 | unsigned long *zones_size) | 5275 | unsigned long *zones_size) |
5046 | { | 5276 | { |
5277 | unsigned int zone; | ||
5278 | |||
5279 | *zone_start_pfn = node_start_pfn; | ||
5280 | for (zone = 0; zone < zone_type; zone++) | ||
5281 | *zone_start_pfn += zones_size[zone]; | ||
5282 | |||
5283 | *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; | ||
5284 | |||
5047 | return zones_size[zone_type]; | 5285 | return zones_size[zone_type]; |
5048 | } | 5286 | } |
5049 | 5287 | ||
@@ -5072,15 +5310,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
5072 | 5310 | ||
5073 | for (i = 0; i < MAX_NR_ZONES; i++) { | 5311 | for (i = 0; i < MAX_NR_ZONES; i++) { |
5074 | struct zone *zone = pgdat->node_zones + i; | 5312 | struct zone *zone = pgdat->node_zones + i; |
5313 | unsigned long zone_start_pfn, zone_end_pfn; | ||
5075 | unsigned long size, real_size; | 5314 | unsigned long size, real_size; |
5076 | 5315 | ||
5077 | size = zone_spanned_pages_in_node(pgdat->node_id, i, | 5316 | size = zone_spanned_pages_in_node(pgdat->node_id, i, |
5078 | node_start_pfn, | 5317 | node_start_pfn, |
5079 | node_end_pfn, | 5318 | node_end_pfn, |
5319 | &zone_start_pfn, | ||
5320 | &zone_end_pfn, | ||
5080 | zones_size); | 5321 | zones_size); |
5081 | real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, | 5322 | real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, |
5082 | node_start_pfn, node_end_pfn, | 5323 | node_start_pfn, node_end_pfn, |
5083 | zholes_size); | 5324 | zholes_size); |
5325 | if (size) | ||
5326 | zone->zone_start_pfn = zone_start_pfn; | ||
5327 | else | ||
5328 | zone->zone_start_pfn = 0; | ||
5084 | zone->spanned_pages = size; | 5329 | zone->spanned_pages = size; |
5085 | zone->present_pages = real_size; | 5330 | zone->present_pages = real_size; |
5086 | 5331 | ||
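In the !CONFIG_HAVE_MEMBLOCK_NODE_MAP stub, a zone's start pfn is simply the node start plus the sizes of all lower zones, which is what the added prefix-sum loop computes. A worked example with made-up zones_size[] values:

#include <stdio.h>

int main(void)
{
        /* Illustrative zones_size[] in pages for a three-zone node (e.g.
         * DMA, DMA32, NORMAL); node_start_pfn is the node's first pfn. */
        unsigned long zones_size[] = { 4096, 1044480, 3145728 };
        unsigned long node_start_pfn = 0;

        unsigned long zone_start_pfn = node_start_pfn;
        for (unsigned int zone = 0; zone < 3; zone++) {
                unsigned long zone_end_pfn = zone_start_pfn + zones_size[zone];

                printf("zone %u: [%lu, %lu)\n", zone, zone_start_pfn, zone_end_pfn);
                zone_start_pfn = zone_end_pfn;      /* next zone starts here */
        }
        return 0;
}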
@@ -5201,7 +5446,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5201 | { | 5446 | { |
5202 | enum zone_type j; | 5447 | enum zone_type j; |
5203 | int nid = pgdat->node_id; | 5448 | int nid = pgdat->node_id; |
5204 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | ||
5205 | int ret; | 5449 | int ret; |
5206 | 5450 | ||
5207 | pgdat_resize_init(pgdat); | 5451 | pgdat_resize_init(pgdat); |
@@ -5217,11 +5461,15 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5217 | #endif | 5461 | #endif |
5218 | init_waitqueue_head(&pgdat->kswapd_wait); | 5462 | init_waitqueue_head(&pgdat->kswapd_wait); |
5219 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 5463 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
5464 | #ifdef CONFIG_COMPACTION | ||
5465 | init_waitqueue_head(&pgdat->kcompactd_wait); | ||
5466 | #endif | ||
5220 | pgdat_page_ext_init(pgdat); | 5467 | pgdat_page_ext_init(pgdat); |
5221 | 5468 | ||
5222 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5469 | for (j = 0; j < MAX_NR_ZONES; j++) { |
5223 | struct zone *zone = pgdat->node_zones + j; | 5470 | struct zone *zone = pgdat->node_zones + j; |
5224 | unsigned long size, realsize, freesize, memmap_pages; | 5471 | unsigned long size, realsize, freesize, memmap_pages; |
5472 | unsigned long zone_start_pfn = zone->zone_start_pfn; | ||
5225 | 5473 | ||
5226 | size = zone->spanned_pages; | 5474 | size = zone->spanned_pages; |
5227 | realsize = freesize = zone->present_pages; | 5475 | realsize = freesize = zone->present_pages; |
@@ -5240,8 +5488,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5240 | " %s zone: %lu pages used for memmap\n", | 5488 | " %s zone: %lu pages used for memmap\n", |
5241 | zone_names[j], memmap_pages); | 5489 | zone_names[j], memmap_pages); |
5242 | } else | 5490 | } else |
5243 | printk(KERN_WARNING | 5491 | pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", |
5244 | " %s zone: %lu pages exceeds freesize %lu\n", | ||
5245 | zone_names[j], memmap_pages, freesize); | 5492 | zone_names[j], memmap_pages, freesize); |
5246 | } | 5493 | } |
5247 | 5494 | ||
@@ -5290,7 +5537,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) | |||
5290 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); | 5537 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
5291 | BUG_ON(ret); | 5538 | BUG_ON(ret); |
5292 | memmap_init(size, nid, j, zone_start_pfn); | 5539 | memmap_init(size, nid, j, zone_start_pfn); |
5293 | zone_start_pfn += size; | ||
5294 | } | 5540 | } |
5295 | } | 5541 | } |
5296 | 5542 | ||
@@ -5358,6 +5604,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
5358 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, | 5604 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
5359 | (u64)start_pfn << PAGE_SHIFT, | 5605 | (u64)start_pfn << PAGE_SHIFT, |
5360 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); | 5606 | end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); |
5607 | #else | ||
5608 | start_pfn = node_start_pfn; | ||
5361 | #endif | 5609 | #endif |
5362 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5610 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
5363 | zones_size, zholes_size); | 5611 | zones_size, zholes_size); |
@@ -5448,8 +5696,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) | |||
5448 | min_pfn = min(min_pfn, start_pfn); | 5696 | min_pfn = min(min_pfn, start_pfn); |
5449 | 5697 | ||
5450 | if (min_pfn == ULONG_MAX) { | 5698 | if (min_pfn == ULONG_MAX) { |
5451 | printk(KERN_WARNING | 5699 | pr_warn("Could not find start_pfn for node %d\n", nid); |
5452 | "Could not find start_pfn for node %d\n", nid); | ||
5453 | return 0; | 5700 | return 0; |
5454 | } | 5701 | } |
5455 | 5702 | ||
@@ -5529,6 +5776,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
5529 | } | 5776 | } |
5530 | 5777 | ||
5531 | /* | 5778 | /* |
5779 | * If kernelcore=mirror is specified, ignore movablecore option | ||
5780 | */ | ||
5781 | if (mirrored_kernelcore) { | ||
5782 | bool mem_below_4gb_not_mirrored = false; | ||
5783 | |||
5784 | for_each_memblock(memory, r) { | ||
5785 | if (memblock_is_mirror(r)) | ||
5786 | continue; | ||
5787 | |||
5788 | nid = r->nid; | ||
5789 | |||
5790 | usable_startpfn = memblock_region_memory_base_pfn(r); | ||
5791 | |||
5792 | if (usable_startpfn < 0x100000) { | ||
5793 | mem_below_4gb_not_mirrored = true; | ||
5794 | continue; | ||
5795 | } | ||
5796 | |||
5797 | zone_movable_pfn[nid] = zone_movable_pfn[nid] ? | ||
5798 | min(usable_startpfn, zone_movable_pfn[nid]) : | ||
5799 | usable_startpfn; | ||
5800 | } | ||
5801 | |||
5802 | if (mem_below_4gb_not_mirrored) | ||
5803 | pr_warn("This configuration results in unmirrored kernel memory."); | ||
5804 | |||
5805 | goto out2; | ||
5806 | } | ||
5807 | |||
5808 | /* | ||
5532 | * If movablecore=nn[KMG] was specified, calculate what size of | 5809 | * If movablecore=nn[KMG] was specified, calculate what size of |
5533 | * kernelcore that corresponds so that memory usable for | 5810 | * kernelcore that corresponds so that memory usable for |
5534 | * any allocation type is evenly spread. If both kernelcore | 5811 | * any allocation type is evenly spread. If both kernelcore |
@@ -5788,6 +6065,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core) | |||
5788 | */ | 6065 | */ |
5789 | static int __init cmdline_parse_kernelcore(char *p) | 6066 | static int __init cmdline_parse_kernelcore(char *p) |
5790 | { | 6067 | { |
6068 | /* parse kernelcore=mirror */ | ||
6069 | if (parse_option_str(p, "mirror")) { | ||
6070 | mirrored_kernelcore = true; | ||
6071 | return 0; | ||
6072 | } | ||
6073 | |||
5791 | return cmdline_parse_core(p, &required_kernelcore); | 6074 | return cmdline_parse_core(p, &required_kernelcore); |
5792 | } | 6075 | } |
5793 | 6076 | ||
@@ -5885,22 +6168,21 @@ void __init mem_init_print_info(const char *str) | |||
5885 | 6168 | ||
5886 | #undef adj_init_size | 6169 | #undef adj_init_size |
5887 | 6170 | ||
5888 | pr_info("Memory: %luK/%luK available " | 6171 | pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" |
5889 | "(%luK kernel code, %luK rwdata, %luK rodata, " | ||
5890 | "%luK init, %luK bss, %luK reserved, %luK cma-reserved" | ||
5891 | #ifdef CONFIG_HIGHMEM | 6172 | #ifdef CONFIG_HIGHMEM |
5892 | ", %luK highmem" | 6173 | ", %luK highmem" |
5893 | #endif | 6174 | #endif |
5894 | "%s%s)\n", | 6175 | "%s%s)\n", |
5895 | nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), | 6176 | nr_free_pages() << (PAGE_SHIFT - 10), |
5896 | codesize >> 10, datasize >> 10, rosize >> 10, | 6177 | physpages << (PAGE_SHIFT - 10), |
5897 | (init_data_size + init_code_size) >> 10, bss_size >> 10, | 6178 | codesize >> 10, datasize >> 10, rosize >> 10, |
5898 | (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), | 6179 | (init_data_size + init_code_size) >> 10, bss_size >> 10, |
5899 | totalcma_pages << (PAGE_SHIFT-10), | 6180 | (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), |
6181 | totalcma_pages << (PAGE_SHIFT - 10), | ||
5900 | #ifdef CONFIG_HIGHMEM | 6182 | #ifdef CONFIG_HIGHMEM |
5901 | totalhigh_pages << (PAGE_SHIFT-10), | 6183 | totalhigh_pages << (PAGE_SHIFT - 10), |
5902 | #endif | 6184 | #endif |
5903 | str ? ", " : "", str ? str : ""); | 6185 | str ? ", " : "", str ? str : ""); |
5904 | } | 6186 | } |
5905 | 6187 | ||
5906 | /** | 6188 | /** |
@@ -6075,8 +6357,17 @@ static void __setup_per_zone_wmarks(void) | |||
6075 | zone->watermark[WMARK_MIN] = tmp; | 6357 | zone->watermark[WMARK_MIN] = tmp; |
6076 | } | 6358 | } |
6077 | 6359 | ||
6078 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 6360 | /* |
6079 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 6361 | * Set the kswapd watermarks distance according to the |
6362 | * scale factor in proportion to available memory, but | ||
6363 | * ensure a minimum size on small systems. | ||
6364 | */ | ||
6365 | tmp = max_t(u64, tmp >> 2, | ||
6366 | mult_frac(zone->managed_pages, | ||
6367 | watermark_scale_factor, 10000)); | ||
6368 | |||
6369 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; | ||
6370 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; | ||
6080 | 6371 | ||
6081 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, | 6372 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, |
6082 | high_wmark_pages(zone) - low_wmark_pages(zone) - | 6373 | high_wmark_pages(zone) - low_wmark_pages(zone) - |
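The low and high watermarks are now offset from the min watermark by max(min/4, managed_pages * watermark_scale_factor / 10000), so the default factor of 10 keeps roughly 0.1% of each zone between the kswapd wake-up and sleep points, and the new vm.watermark_scale_factor sysctl (handled later in this diff) rebuilds the watermarks when changed. A sketch of the arithmetic, using plain integer division rather than the kernel's mult_frac() and an invented 16GB zone:

#include <stdio.h>

/* The kswapd watermark gap introduced here: the larger of min/4 and
 * managed_pages * watermark_scale_factor / 10000 (0.1% per unit of 10). */
static unsigned long wmark_gap(unsigned long min_pages,
                               unsigned long managed_pages,
                               unsigned long scale_factor)
{
        unsigned long by_scale = managed_pages * scale_factor / 10000;
        unsigned long by_min = min_pages / 4;

        return by_scale > by_min ? by_scale : by_min;
}

int main(void)
{
        /* 16GB zone with 4K pages; an assumed min watermark of 16384 pages. */
        unsigned long managed = 4UL * 1024 * 1024, min = 16384;

        for (unsigned long factor = 10; factor <= 200; factor *= 2) {
                unsigned long gap = wmark_gap(min, managed, factor);

                printf("factor %3lu: low = min + %lu, high = min + %lu\n",
                       factor, gap, 2 * gap);
        }
        return 0;
}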
@@ -6217,6 +6508,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, | |||
6217 | return 0; | 6508 | return 0; |
6218 | } | 6509 | } |
6219 | 6510 | ||
6511 | int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, | ||
6512 | void __user *buffer, size_t *length, loff_t *ppos) | ||
6513 | { | ||
6514 | int rc; | ||
6515 | |||
6516 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
6517 | if (rc) | ||
6518 | return rc; | ||
6519 | |||
6520 | if (write) | ||
6521 | setup_per_zone_wmarks(); | ||
6522 | |||
6523 | return 0; | ||
6524 | } | ||
6525 | |||
6220 | #ifdef CONFIG_NUMA | 6526 | #ifdef CONFIG_NUMA |
6221 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, | 6527 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, |
6222 | void __user *buffer, size_t *length, loff_t *ppos) | 6528 | void __user *buffer, size_t *length, loff_t *ppos) |
@@ -6408,11 +6714,8 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
6408 | if (!table) | 6714 | if (!table) |
6409 | panic("Failed to allocate %s hash table\n", tablename); | 6715 | panic("Failed to allocate %s hash table\n", tablename); |
6410 | 6716 | ||
6411 | printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", | 6717 | pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", |
6412 | tablename, | 6718 | tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); |
6413 | (1UL << log2qty), | ||
6414 | ilog2(size) - PAGE_SHIFT, | ||
6415 | size); | ||
6416 | 6719 | ||
6417 | if (_hash_shift) | 6720 | if (_hash_shift) |
6418 | *_hash_shift = log2qty; | 6721 | *_hash_shift = log2qty; |
@@ -6563,7 +6866,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
6563 | * This check already skips compound tails of THP | 6866 | * This check already skips compound tails of THP |
6564 | * because their page->_count is zero at all time. | 6867 | * because their page->_count is zero at all time. |
6565 | */ | 6868 | */ |
6566 | if (!atomic_read(&page->_count)) { | 6869 | if (!page_ref_count(page)) { |
6567 | if (PageBuddy(page)) | 6870 | if (PageBuddy(page)) |
6568 | iter += (1 << page_order(page)) - 1; | 6871 | iter += (1 << page_order(page)) - 1; |
6569 | continue; | 6872 | continue; |
@@ -6913,8 +7216,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6913 | BUG_ON(!PageBuddy(page)); | 7216 | BUG_ON(!PageBuddy(page)); |
6914 | order = page_order(page); | 7217 | order = page_order(page); |
6915 | #ifdef CONFIG_DEBUG_VM | 7218 | #ifdef CONFIG_DEBUG_VM |
6916 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | 7219 | pr_info("remove from free list %lx %d %lx\n", |
6917 | pfn, 1 << order, end_pfn); | 7220 | pfn, 1 << order, end_pfn); |
6918 | #endif | 7221 | #endif |
6919 | list_del(&page->lru); | 7222 | list_del(&page->lru); |
6920 | rmv_page_order(page); | 7223 | rmv_page_order(page); |
@@ -6927,7 +7230,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6927 | } | 7230 | } |
6928 | #endif | 7231 | #endif |
6929 | 7232 | ||
6930 | #ifdef CONFIG_MEMORY_FAILURE | ||
6931 | bool is_free_buddy_page(struct page *page) | 7233 | bool is_free_buddy_page(struct page *page) |
6932 | { | 7234 | { |
6933 | struct zone *zone = page_zone(page); | 7235 | struct zone *zone = page_zone(page); |
@@ -6946,4 +7248,3 @@ bool is_free_buddy_page(struct page *page) | |||
6946 | 7248 | ||
6947 | return order < MAX_ORDER; | 7249 | return order < MAX_ORDER; |
6948 | } | 7250 | } |
6949 | #endif | ||