Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 354
 1 file changed, 225 insertions(+), 129 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68c..4ba5e37127fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 #ifdef CONFIG_HIGHMEM
 	[N_HIGH_MEMORY] = { { [0] = 1UL } },
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+	[N_MEMORY] = { { [0] = 1UL } },
+#endif
 	[N_CPU] = { { [0] = 1UL } },
 #endif /* NUMA */
 };
@@ -368,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
 	int nr_pages = 1 << order;
 	int bad = 0;
 
-	if (unlikely(compound_order(page) != order) ||
-	    unlikely(!PageHead(page))) {
+	if (unlikely(compound_order(page) != order)) {
 		bad_page(page);
 		bad++;
 	}
@@ -523,7 +525,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * If a block is freed, and its buddy is also free, then this
  * triggers coalescing into a block of larger size.
  *
- * -- wli
+ * -- nyc
  */
 
 static inline void __free_one_page(struct page *page,
@@ -608,6 +610,7 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 		return 1;
 	}
+	reset_page_last_nid(page);
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -667,11 +670,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 			__free_one_page(page, zone, 0, mt);
 			trace_mm_page_pcpu_drain(page, 0, mt);
-			if (is_migrate_cma(mt))
-				__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+			if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
+				__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+				if (is_migrate_cma(mt))
+					__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+			}
 		} while (--to_free && --batch_free && !list_empty(list));
 	}
-	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
 	spin_unlock(&zone->lock);
 }
 
@@ -730,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
+/*
+ * Read access to zone->managed_pages is safe because it's unsigned long,
+ * but we still need to serialize writers. Currently all callers of
+ * __free_pages_bootmem() except put_page_bootmem() should only be used
+ * at boot time. So for shorter boot time, we shift the burden to
+ * put_page_bootmem() to serialize writers.
+ */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
@@ -745,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 		set_page_count(p, 0);
 	}
 
+	page_zone(page)->managed_pages += 1 << order;
 	set_page_refcounted(page);
 	__free_pages(page, order);
 }
@@ -780,7 +793,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
  * large block of memory acted on by a series of small allocations.
  * This behavior is a critical factor in sglist merging's success.
  *
- * -- wli
+ * -- nyc
  */
 static inline void expand(struct zone *zone, struct page *page,
 	int low, int high, struct free_area *area,
@@ -1392,21 +1405,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 
 	zone = page_zone(page);
 	order = page_order(page);
+	mt = get_pageblock_migratetype(page);
 
-	/* Obey watermarks as if the page was being allocated */
-	watermark = low_wmark_pages(zone) + (1 << order);
-	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-		return 0;
+	if (mt != MIGRATE_ISOLATE) {
+		/* Obey watermarks as if the page was being allocated */
+		watermark = low_wmark_pages(zone) + (1 << order);
+		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+			return 0;
+
+		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+	}
 
 	/* Remove page from free list */
 	list_del(&page->lru);
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	mt = get_pageblock_migratetype(page);
-	if (unlikely(mt != MIGRATE_ISOLATE))
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
-
 	if (alloc_order != order)
 		expand(zone, page, alloc_order, order,
 			&zone->free_area[order], migratetype);
@@ -1692,7 +1706,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
  *
  * If the zonelist cache is present in the passed in zonelist, then
  * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
+ * tasks mems_allowed, or node_states[N_MEMORY].)
  *
  * If the zonelist cache is not available for this zonelist, does
  * nothing and returns NULL.
@@ -1721,7 +1735,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 
 	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
 					&cpuset_current_mems_allowed :
-					&node_states[N_HIGH_MEMORY];
+					&node_states[N_MEMORY];
 	return allowednodes;
 }
 
@@ -1871,7 +1885,7 @@ zonelist_scan:
 	 */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						high_zoneidx, nodemask) {
-		if (NUMA_BUILD && zlc_active &&
+		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
 		if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1931,8 @@ zonelist_scan:
 				classzone_idx, alloc_flags))
 			goto try_this_zone;
 
-		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+		if (IS_ENABLED(CONFIG_NUMA) &&
+			!did_zlc_setup && nr_online_nodes > 1) {
 			/*
 			 * we do zlc_setup if there are multiple nodes
 			 * and before considering the first zone allowed
@@ -1936,7 +1951,7 @@ zonelist_scan:
 		 * As we may have just activated ZLC, check if the first
 		 * eligible zone has failed zone_reclaim recently.
 		 */
-		if (NUMA_BUILD && zlc_active &&
+		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 			continue;
 
@@ -1962,11 +1977,11 @@ try_this_zone:
 		if (page)
 			break;
 this_zone_full:
-		if (NUMA_BUILD)
+		if (IS_ENABLED(CONFIG_NUMA))
 			zlc_mark_zone_full(zonelist, z);
 	}
 
-	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
 		/* Disable zlc cache for second zonelist scan */
 		zlc_active = 0;
 		goto zonelist_scan;
@@ -2266,7 +2281,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 		return NULL;
 
 	/* After successful reclaim, reconsider all zones for allocation */
-	if (NUMA_BUILD)
+	if (IS_ENABLED(CONFIG_NUMA))
 		zlc_clear_zones_full(zonelist);
 
 retry:
@@ -2412,7 +2427,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * allowed per node queues are empty and that nodes are
 	 * over allocated.
 	 */
-	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+	if (IS_ENABLED(CONFIG_NUMA) &&
+		(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
 restart:
@@ -2596,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+	struct mem_cgroup *memcg = NULL;
 
 	gfp_mask &= gfp_allowed_mask;
 
@@ -2614,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
+	/*
+	 * Will only have any effect when __GFP_KMEMCG is set. This is
+	 * verified in the (always inline) callee
+	 */
+	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+		return NULL;
+
 retry_cpuset:
 	cpuset_mems_cookie = get_mems_allowed();
 
@@ -2649,6 +2673,8 @@ out:
 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
 		goto retry_cpuset;
 
+	memcg_kmem_commit_charge(page, memcg, order);
+
 	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2701,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
+/*
+ * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
+ * pages allocated with __GFP_KMEMCG.
+ *
+ * Those pages are accounted to a particular memcg, embedded in the
+ * corresponding page_cgroup. To avoid adding a hit in the allocator to search
+ * for that information only to find out that it is NULL for users who have no
+ * interest in that whatsoever, we provide these functions.
+ *
+ * The caller knows better which flags it relies on.
+ */
+void __free_memcg_kmem_pages(struct page *page, unsigned int order)
+{
+	memcg_kmem_uncharge_pages(page, order);
+	__free_pages(page, order);
+}
+
+void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
+{
+	if (addr != 0) {
+		VM_BUG_ON(!virt_addr_valid((void *)addr));
+		__free_memcg_kmem_pages(virt_to_page((void *)addr), order);
+	}
+}
+
 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
 {
 	if (addr) {
@@ -2819,7 +2870,7 @@ unsigned int nr_free_pagecache_pages(void)
 
 static inline void show_node(struct zone *zone)
 {
-	if (NUMA_BUILD)
+	if (IS_ENABLED(CONFIG_NUMA))
 		printk("Node %d ", zone_to_nid(zone));
 }
 
@@ -2877,6 +2928,31 @@ out:
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+static void show_migration_types(unsigned char type)
+{
+	static const char types[MIGRATE_TYPES] = {
+		[MIGRATE_UNMOVABLE]	= 'U',
+		[MIGRATE_RECLAIMABLE]	= 'E',
+		[MIGRATE_MOVABLE]	= 'M',
+		[MIGRATE_RESERVE]	= 'R',
+#ifdef CONFIG_CMA
+		[MIGRATE_CMA]		= 'C',
+#endif
+		[MIGRATE_ISOLATE]	= 'I',
+	};
+	char tmp[MIGRATE_TYPES + 1];
+	char *p = tmp;
+	int i;
+
+	for (i = 0; i < MIGRATE_TYPES; i++) {
+		if (type & (1 << i))
+			*p++ = types[i];
+	}
+
+	*p = '\0';
+	printk("(%s) ", tmp);
+}
+
 /*
  * Show free area list (used inside shift_scroll-lock stuff)
  * We also calculate the percentage fragmentation. We do this by counting the
@@ -2951,6 +3027,7 @@ void show_free_areas(unsigned int filter)
 			" isolated(anon):%lukB"
 			" isolated(file):%lukB"
 			" present:%lukB"
+			" managed:%lukB"
 			" mlocked:%lukB"
 			" dirty:%lukB"
 			" writeback:%lukB"
@@ -2980,6 +3057,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_ISOLATED_ANON)),
 			K(zone_page_state(zone, NR_ISOLATED_FILE)),
 			K(zone->present_pages),
+			K(zone->managed_pages),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_FILE_DIRTY)),
 			K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3005,6 +3083,7 @@ void show_free_areas(unsigned int filter)
 
 	for_each_populated_zone(zone) {
 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
+		unsigned char types[MAX_ORDER];
 
 		if (skip_free_areas_node(filter, zone_to_nid(zone)))
 			continue;
@@ -3013,12 +3092,24 @@ void show_free_areas(unsigned int filter)
 
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < MAX_ORDER; order++) {
-			nr[order] = zone->free_area[order].nr_free;
+			struct free_area *area = &zone->free_area[order];
+			int type;
+
+			nr[order] = area->nr_free;
 			total += nr[order] << order;
+
+			types[order] = 0;
+			for (type = 0; type < MIGRATE_TYPES; type++) {
+				if (!list_empty(&area->free_list[type]))
+					types[order] |= 1 << type;
+			}
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
-		for (order = 0; order < MAX_ORDER; order++)
+		for (order = 0; order < MAX_ORDER; order++) {
 			printk("%lu*%lukB ", nr[order], K(1UL) << order);
+			if (nr[order])
+				show_migration_types(types[order]);
+		}
 		printk("= %lukB\n", K(total));
 	}
 
@@ -3195,7 +3286,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 		return node;
 	}
 
-	for_each_node_state(n, N_HIGH_MEMORY) {
+	for_each_node_state(n, N_MEMORY) {
 
 		/* Don't want a node to appear more than once */
 		if (node_isset(n, *used_node_mask))
@@ -3337,7 +3428,7 @@ static int default_zonelist_order(void)
 	 * local memory, NODE_ORDER may be suitable.
 	 */
 	average_size = total_size /
-			(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
+			(nodes_weight(node_states[N_MEMORY]) + 1);
 	for_each_online_node(nid) {
 		low_kmem_size = 0;
 		total_size = 0;
@@ -3827,6 +3918,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
+		reset_page_last_nid(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
@@ -4433,6 +4525,26 @@ void __init set_pageblock_order(void)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
+static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
+						unsigned long present_pages)
+{
+	unsigned long pages = spanned_pages;
+
+	/*
+	 * Provide a more accurate estimation if there are holes within
+	 * the zone and SPARSEMEM is in use. If there are holes within the
+	 * zone, each populated memory region may cost us one or two extra
+	 * memmap pages due to alignment because memmap pages for each
+	 * populated regions may not naturally algined on page boundary.
+	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
+	 */
+	if (spanned_pages > present_pages + (present_pages >> 4) &&
+	    IS_ENABLED(CONFIG_SPARSEMEM))
+		pages = present_pages;
+
+	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -4450,54 +4562,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	int ret;
 
 	pgdat_resize_init(pgdat);
+#ifdef CONFIG_NUMA_BALANCING
+	spin_lock_init(&pgdat->numabalancing_migrate_lock);
+	pgdat->numabalancing_migrate_nr_pages = 0;
+	pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat_page_cgroup_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize, memmap_pages;
+		unsigned long size, realsize, freesize, memmap_pages;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
+		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
 								zholes_size);
 
 		/*
-		 * Adjust realsize so that it accounts for how much memory
+		 * Adjust freesize so that it accounts for how much memory
 		 * is used by this zone for memmap. This affects the watermark
 		 * and per-cpu initialisations
 		 */
-		memmap_pages =
-			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
-		if (realsize >= memmap_pages) {
-			realsize -= memmap_pages;
+		memmap_pages = calc_memmap_size(size, realsize);
+		if (freesize >= memmap_pages) {
+			freesize -= memmap_pages;
 			if (memmap_pages)
 				printk(KERN_DEBUG
 				       "  %s zone: %lu pages used for memmap\n",
 				       zone_names[j], memmap_pages);
 		} else
 			printk(KERN_WARNING
-				"  %s zone: %lu pages exceeds realsize %lu\n",
-				zone_names[j], memmap_pages, realsize);
+				"  %s zone: %lu pages exceeds freesize %lu\n",
+				zone_names[j], memmap_pages, freesize);
 
 		/* Account for reserved pages */
-		if (j == 0 && realsize > dma_reserve) {
-			realsize -= dma_reserve;
+		if (j == 0 && freesize > dma_reserve) {
+			freesize -= dma_reserve;
 			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
 					zone_names[0], dma_reserve);
 		}
 
 		if (!is_highmem_idx(j))
-			nr_kernel_pages += realsize;
-		nr_all_pages += realsize;
+			nr_kernel_pages += freesize;
+		/* Charge for highmem memmap if there are enough kernel pages */
+		else if (nr_kernel_pages > memmap_pages * 2)
+			nr_kernel_pages -= memmap_pages;
+		nr_all_pages += freesize;
 
 		zone->spanned_pages = size;
-		zone->present_pages = realsize;
+		zone->present_pages = freesize;
+		/*
+		 * Set an approximate value for lowmem here, it will be adjusted
+		 * when the bootmem allocator frees pages into the buddy system.
+		 * And all highmem pages will be managed by the buddy system.
+		 */
+		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
 		zone->node = nid;
-		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
+		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
 						/ 100;
-		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
@@ -4688,7 +4813,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
 /*
  * early_calculate_totalpages()
  * Sum pages in active regions for movable zone.
- * Populate N_HIGH_MEMORY for calculating usable_nodes.
+ * Populate N_MEMORY for calculating usable_nodes.
  */
 static unsigned long __init early_calculate_totalpages(void)
 {
@@ -4701,7 +4826,7 @@ static unsigned long __init early_calculate_totalpages(void)
 
 		totalpages += pages;
 		if (pages)
-			node_set_state(nid, N_HIGH_MEMORY);
+			node_set_state(nid, N_MEMORY);
 	}
 	return totalpages;
 }
@@ -4718,9 +4843,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	unsigned long usable_startpfn;
 	unsigned long kernelcore_node, kernelcore_remaining;
 	/* save the state before borrow the nodemask */
-	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
+	nodemask_t saved_node_state = node_states[N_MEMORY];
 	unsigned long totalpages = early_calculate_totalpages();
-	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
 
 	/*
 	 * If movablecore was specified, calculate what size of
@@ -4755,7 +4880,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 restart:
 	/* Spread kernelcore memory as evenly as possible throughout nodes */
 	kernelcore_node = required_kernelcore / usable_nodes;
-	for_each_node_state(nid, N_HIGH_MEMORY) {
+	for_each_node_state(nid, N_MEMORY) {
 		unsigned long start_pfn, end_pfn;
 
 		/*
@@ -4847,23 +4972,27 @@ restart:
 
 out:
 	/* restore the node_state */
-	node_states[N_HIGH_MEMORY] = saved_node_state;
+	node_states[N_MEMORY] = saved_node_state;
 }
 
-/* Any regular memory on that node ? */
-static void __init check_for_regular_memory(pg_data_t *pgdat)
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
 {
-#ifdef CONFIG_HIGHMEM
 	enum zone_type zone_type;
 
-	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
+	if (N_MEMORY == N_NORMAL_MEMORY)
+		return;
+
+	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
 		struct zone *zone = &pgdat->node_zones[zone_type];
 		if (zone->present_pages) {
-			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+			node_set_state(nid, N_HIGH_MEMORY);
+			if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
+			    zone_type <= ZONE_NORMAL)
+				node_set_state(nid, N_NORMAL_MEMORY);
 			break;
 		}
 	}
-#endif
 }
 
 /**
@@ -4946,8 +5075,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
 		/* Any memory on that node */
 		if (pgdat->node_present_pages)
-			node_set_state(nid, N_HIGH_MEMORY);
-		check_for_regular_memory(pgdat);
+			node_set_state(nid, N_MEMORY);
+		check_for_memory(pgdat, nid);
 	}
 }
 
@@ -5175,10 +5304,6 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
-		zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
-		zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
-		zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
-
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -5576,7 +5701,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
  * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
  * expect this function should be exact.
  */
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+			 bool skip_hwpoisoned_pages)
 {
 	unsigned long pfn, iter, found;
 	int mt;
@@ -5611,6 +5737,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 			continue;
 		}
 
+		/*
+		 * The HWPoisoned page may be not in buddy system, and
+		 * page_count() is not 0.
+		 */
+		if (skip_hwpoisoned_pages && PageHWPoison(page))
+			continue;
+
 		if (!PageLRU(page))
 			found++;
 		/*
@@ -5653,7 +5786,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 	    zone->zone_start_pfn + zone->spanned_pages <= pfn)
 		return false;
 
-	return !has_unmovable_pages(zone, page, 0);
+	return !has_unmovable_pages(zone, page, 0, true);
 }
 
 #ifdef CONFIG_CMA
@@ -5680,7 +5813,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 	unsigned int tries = 0;
 	int ret = 0;
 
-	migrate_prep_local();
+	migrate_prep();
 
 	while (pfn < end || !list_empty(&cc->migratepages)) {
 		if (fatal_signal_pending(current)) {
@@ -5708,61 +5841,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
 		ret = migrate_pages(&cc->migratepages,
 				    alloc_migrate_target,
-				    0, false, MIGRATE_SYNC);
+				    0, false, MIGRATE_SYNC,
+				    MR_CMA);
 	}
 
-	putback_lru_pages(&cc->migratepages);
+	putback_movable_pages(&cc->migratepages);
 	return ret > 0 ? 0 : ret;
 }
 
-/*
- * Update zone's cma pages counter used for watermark level calculation.
- */
-static inline void __update_cma_watermarks(struct zone *zone, int count)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&zone->lock, flags);
-	zone->min_cma_pages += count;
-	spin_unlock_irqrestore(&zone->lock, flags);
-	setup_per_zone_wmarks();
-}
-
-/*
- * Trigger memory pressure bump to reclaim some pages in order to be able to
- * allocate 'count' pages in single page units. Does similar work as
- *__alloc_pages_slowpath() function.
- */
-static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
-{
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	struct zonelist *zonelist = node_zonelist(0, gfp_mask);
-	int did_some_progress = 0;
-	int order = 1;
-
-	/*
-	 * Increase level of watermarks to force kswapd do his job
-	 * to stabilise at new watermark level.
-	 */
-	__update_cma_watermarks(zone, count);
-
-	/* Obey watermarks as if the page was being allocated */
-	while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
-		wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
-
-		did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
-						      NULL);
-		if (!did_some_progress) {
-			/* Exhausted what can be done so it's blamo time */
-			out_of_memory(zonelist, gfp_mask, order, NULL, false);
-		}
-	}
-
-	/* Restore original watermark levels. */
-	__update_cma_watermarks(zone, -count);
-
-	return count;
-}
-
 /**
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:	start PFN to allocate
@@ -5786,7 +5872,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
 int alloc_contig_range(unsigned long start, unsigned long end,
 		       unsigned migratetype)
 {
-	struct zone *zone = page_zone(pfn_to_page(start));
 	unsigned long outer_start, outer_end;
 	int ret = 0, order;
 
@@ -5824,7 +5909,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	 */
 
 	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype);
+				       pfn_max_align_up(end), migratetype,
+				       false);
 	if (ret)
 		return ret;
 
@@ -5863,18 +5949,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	}
 
 	/* Make sure the range is really isolated. */
-	if (test_pages_isolated(outer_start, end)) {
+	if (test_pages_isolated(outer_start, end, false)) {
 		pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
 		       outer_start, end);
 		ret = -EBUSY;
 		goto done;
 	}
 
-	/*
-	 * Reclaim enough pages to make sure that contiguous allocation
-	 * will not starve the system.
-	 */
-	__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
 
 	/* Grab isolated pages from freelists. */
 	outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5897,8 +5978,15 @@ done:
 
 void free_contig_range(unsigned long pfn, unsigned nr_pages)
 {
-	for (; nr_pages--; ++pfn)
-		__free_page(pfn_to_page(pfn));
+	unsigned int count = 0;
+
+	for (; nr_pages--; pfn++) {
+		struct page *page = pfn_to_page(pfn);
+
+		count += page_count(page) != 1;
+		__free_page(page);
+	}
+	WARN(count != 0, "%d pages are still in use!\n", count);
 }
 #endif
 
@@ -5932,7 +6020,6 @@ void __meminit zone_pcp_update(struct zone *zone)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void zone_pcp_reset(struct zone *zone)
 {
 	unsigned long flags;
@@ -5952,6 +6039,7 @@ void zone_pcp_reset(struct zone *zone)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be isolated before calling this.
  */
@@ -5978,6 +6066,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 			continue;
 		}
 		page = pfn_to_page(pfn);
+		/*
+		 * The HWPoisoned page may be not in buddy system, and
+		 * page_count() is not 0.
+		 */
+		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
+			pfn++;
+			SetPageReserved(page);
+			continue;
+		}
+
 		BUG_ON(page_count(page));
 		BUG_ON(!PageBuddy(page));
 		order = page_order(page);
@@ -5988,8 +6086,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		list_del(&page->lru);
 		rmv_page_order(page);
 		zone->free_area[order].nr_free--;
-		__mod_zone_page_state(zone, NR_FREE_PAGES,
-			      - (1UL << order));
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
 		pfn += (1 << order);