Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	494
1 file changed, 201 insertions(+), 293 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef44ad736ca1..736d8e1b6381 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,8 +53,6 @@
 #include <linux/kmemleak.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
-#include <linux/ftrace_event.h>
-#include <linux/memcontrol.h>
 #include <linux/prefetch.h>
 #include <linux/mm_inline.h>
 #include <linux/migrate.h>
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
  */
 DEFINE_PER_CPU(int, _numa_mem_);	/* Kernel "local memory" node */
 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+int _node_numa_mem_[MAX_NUMNODES];
 #endif
 
 /*
@@ -680,9 +679,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	int migratetype = 0;
 	int batch_free = 0;
 	int to_free = count;
+	unsigned long nr_scanned;
 
 	spin_lock(&zone->lock);
-	zone->pages_scanned = 0;
+	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+	if (nr_scanned)
+		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
 	while (to_free) {
 		struct page *page;
@@ -731,8 +733,11 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
+	unsigned long nr_scanned;
 	spin_lock(&zone->lock);
-	zone->pages_scanned = 0;
+	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
+	if (nr_scanned)
+		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
 	__free_one_page(page, pfn, zone, order, migratetype);
 	if (unlikely(!is_migrate_isolate(migratetype)))
@@ -1008,7 +1013,7 @@ int move_freepages(struct zone *zone,
 	 * Remove at a later date when no bug reports exist related to
 	 * grouping pages by mobility
 	 */
-	BUG_ON(page_zone(start_page) != page_zone(end_page));
+	VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
 #endif
 
 	for (page = start_page; page <= end_page;) {
@@ -1257,15 +1262,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
 	unsigned long flags;
-	int to_drain;
-	unsigned long batch;
+	int to_drain, batch;
 
 	local_irq_save(flags);
 	batch = ACCESS_ONCE(pcp->batch);
-	if (pcp->count >= batch)
-		to_drain = batch;
-	else
-		to_drain = pcp->count;
+	to_drain = min(pcp->count, batch);
 	if (to_drain > 0) {
 		free_pcppages_bulk(zone, to_drain, pcp);
 		pcp->count -= to_drain;
@@ -1610,6 +1611,9 @@ again:
 	}
 
 	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
+	    !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
+		set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1712,7 +1716,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
 	long free_cma = 0;
 
@@ -1727,7 +1730,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 		free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
 
-	if (free_pages - free_cma <= min + lowmem_reserve)
+	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -1922,6 +1925,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 
 #endif	/* CONFIG_NUMA */
 
+static void reset_alloc_batches(struct zone *preferred_zone)
+{
+	struct zone *zone = preferred_zone->zone_pgdat->node_zones;
+
+	do {
+		mod_zone_page_state(zone, NR_ALLOC_BATCH,
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+		clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
+	} while (zone++ != preferred_zone);
+}
+
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -1939,8 +1954,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
 				(gfp_mask & __GFP_WRITE);
+	int nr_fair_skipped = 0;
+	bool zonelist_rescan;
 
 zonelist_scan:
+	zonelist_rescan = false;
+
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
@@ -1964,9 +1983,11 @@ zonelist_scan:
 		 */
 		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
+				break;
+			if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
+				nr_fair_skipped++;
 				continue;
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-				continue;
+			}
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2072,13 +2093,7 @@ this_zone_full:
 			zlc_mark_zone_full(zonelist, z);
 	}
 
-	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
-		/* Disable zlc cache for second zonelist scan */
-		zlc_active = 0;
-		goto zonelist_scan;
-	}
-
-	if (page)
+	if (page) {
 		/*
 		 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
 		 * necessary to allocate the page. The expectation is
@@ -2087,8 +2102,37 @@ this_zone_full:
 		 * for !PFMEMALLOC purposes.
 		 */
 		page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+		return page;
+	}
 
-	return page;
+	/*
+	 * The first pass makes sure allocations are spread fairly within the
+	 * local node. However, the local node might have free pages left
+	 * after the fairness batches are exhausted, and remote zones haven't
+	 * even been considered yet. Try once more without fairness, and
+	 * include remote zones now, before entering the slowpath and waking
+	 * kswapd: prefer spilling to a remote zone over swapping locally.
+	 */
+	if (alloc_flags & ALLOC_FAIR) {
+		alloc_flags &= ~ALLOC_FAIR;
+		if (nr_fair_skipped) {
+			zonelist_rescan = true;
+			reset_alloc_batches(preferred_zone);
+		}
+		if (nr_online_nodes > 1)
+			zonelist_rescan = true;
+	}
+
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
+		/* Disable zlc cache for second zonelist scan */
+		zlc_active = 0;
+		zonelist_rescan = true;
+	}
+
+	if (zonelist_rescan)
+		goto zonelist_scan;
+
+	return NULL;
 }
 
 /*
@@ -2201,8 +2245,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 {
 	struct page *page;
 
-	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
+	/* Acquire the per-zone oom lock for each zone */
+	if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
 		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
@@ -2240,7 +2284,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 		out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
-	clear_zonelist_oom(zonelist, gfp_mask);
+	oom_zonelist_unlock(zonelist, gfp_mask);
 	return page;
 }
 
@@ -2251,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
 	int classzone_idx, int migratetype, enum migrate_mode mode,
-	bool *contended_compaction, bool *deferred_compaction,
-	unsigned long *did_some_progress)
+	int *contended_compaction, bool *deferred_compaction)
 {
-	if (!order)
-		return NULL;
+	struct zone *last_compact_zone = NULL;
+	unsigned long compact_result;
+	struct page *page;
 
-	if (compaction_deferred(preferred_zone, order)) {
-		*deferred_compaction = true;
+	if (!order)
 		return NULL;
-	}
 
 	current->flags |= PF_MEMALLOC;
-	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+	compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, mode,
-						contended_compaction);
+						contended_compaction,
+						&last_compact_zone);
 	current->flags &= ~PF_MEMALLOC;
 
-	if (*did_some_progress != COMPACT_SKIPPED) {
-		struct page *page;
+	switch (compact_result) {
+	case COMPACT_DEFERRED:
+		*deferred_compaction = true;
+		/* fall-through */
+	case COMPACT_SKIPPED:
+		return NULL;
+	default:
+		break;
+	}
 
-		/* Page migration frees to the PCP lists but we want merging */
-		drain_pages(get_cpu());
-		put_cpu();
+	/*
+	 * At least in one zone compaction wasn't deferred or skipped, so let's
+	 * count a compaction stall
+	 */
+	count_vm_event(COMPACTSTALL);
 
-		page = get_page_from_freelist(gfp_mask, nodemask,
-				order, zonelist, high_zoneidx,
-				alloc_flags & ~ALLOC_NO_WATERMARKS,
-				preferred_zone, classzone_idx, migratetype);
-		if (page) {
-			preferred_zone->compact_blockskip_flush = false;
-			compaction_defer_reset(preferred_zone, order, true);
-			count_vm_event(COMPACTSUCCESS);
-			return page;
-		}
+	/* Page migration frees to the PCP lists but we want merging */
+	drain_pages(get_cpu());
+	put_cpu();
 
-		/*
-		 * It's bad if compaction run occurs and fails.
-		 * The most likely reason is that pages exist,
-		 * but not enough to satisfy watermarks.
-		 */
-		count_vm_event(COMPACTFAIL);
+	page = get_page_from_freelist(gfp_mask, nodemask,
+			order, zonelist, high_zoneidx,
+			alloc_flags & ~ALLOC_NO_WATERMARKS,
+			preferred_zone, classzone_idx, migratetype);
 
-		/*
-		 * As async compaction considers a subset of pageblocks, only
-		 * defer if the failure was a sync compaction failure.
-		 */
-		if (mode != MIGRATE_ASYNC)
-			defer_compaction(preferred_zone, order);
+	if (page) {
+		struct zone *zone = page_zone(page);
 
-		cond_resched();
+		zone->compact_blockskip_flush = false;
+		compaction_defer_reset(zone, order, true);
+		count_vm_event(COMPACTSUCCESS);
+		return page;
 	}
 
+	/*
+	 * last_compact_zone is where try_to_compact_pages thought allocation
+	 * should succeed, so it did not defer compaction. But here we know
+	 * that it didn't succeed, so we do the defer.
+	 */
+	if (last_compact_zone && mode != MIGRATE_ASYNC)
+		defer_compaction(last_compact_zone, order);
+
+	/*
+	 * It's bad if compaction run occurs and fails. The most likely reason
+	 * is that pages exist, but not enough to satisfy watermarks.
+	 */
+	count_vm_event(COMPACTFAIL);
+
+	cond_resched();
+
 	return NULL;
 }
 #else
@@ -2310,9 +2368,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int classzone_idx, int migratetype,
-	enum migrate_mode mode, bool *contended_compaction,
-	bool *deferred_compaction, unsigned long *did_some_progress)
+	int classzone_idx, int migratetype, enum migrate_mode mode,
+	int *contended_compaction, bool *deferred_compaction)
 {
 	return NULL;
 }
@@ -2409,37 +2466,17 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static void reset_alloc_batches(struct zonelist *zonelist,
-				enum zone_type high_zoneidx,
-				struct zone *preferred_zone)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		/*
-		 * Only reset the batches of zones that were actually
-		 * considered in the fairness pass, we don't want to
-		 * trash fairness information for zones that are not
-		 * actually part of this zonelist's round-robin cycle.
-		 */
-		if (!zone_local(preferred_zone, zone))
-			continue;
-		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-			high_wmark_pages(zone) - low_wmark_pages(zone) -
-			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
-	}
-}
-
 static void wake_all_kswapds(unsigned int order,
 			     struct zonelist *zonelist,
 			     enum zone_type high_zoneidx,
-			     struct zone *preferred_zone)
+			     struct zone *preferred_zone,
+			     nodemask_t *nodemask)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						high_zoneidx, nodemask)
 		wakeup_kswapd(zone, order, zone_idx(preferred_zone));
 }
 
@@ -2486,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 #ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+	if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
 	return alloc_flags;
@@ -2510,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	bool deferred_compaction = false;
-	bool contended_compaction = false;
+	int contended_compaction = COMPACT_CONTENDED_NONE;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2537,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 
 restart:
 	if (!(gfp_mask & __GFP_NO_KSWAPD))
-		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+		wake_all_kswapds(order, zonelist, high_zoneidx,
+				preferred_zone, nodemask);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2610,29 +2648,50 @@ rebalance:
 					preferred_zone,
 					classzone_idx, migratetype,
 					migration_mode, &contended_compaction,
-					&deferred_compaction,
-					&did_some_progress);
+					&deferred_compaction);
 	if (page)
 		goto got_pg;
 
+	/* Checks for THP-specific high-order allocations */
+	if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+		/*
+		 * If compaction is deferred for high-order allocations, it is
+		 * because sync compaction recently failed. If this is the case
+		 * and the caller requested a THP allocation, we do not want
+		 * to heavily disrupt the system, so we fail the allocation
+		 * instead of entering direct reclaim.
+		 */
+		if (deferred_compaction)
+			goto nopage;
+
+		/*
+		 * In all zones where compaction was attempted (and not
+		 * deferred or skipped), lock contention has been detected.
+		 * For THP allocation we do not want to disrupt the others
+		 * so we fallback to base pages instead.
+		 */
+		if (contended_compaction == COMPACT_CONTENDED_LOCK)
+			goto nopage;
+
+		/*
+		 * If compaction was aborted due to need_resched(), we do not
+		 * want to further increase allocation latency, unless it is
+		 * khugepaged trying to collapse.
+		 */
+		if (contended_compaction == COMPACT_CONTENDED_SCHED
+			&& !(current->flags & PF_KTHREAD))
+			goto nopage;
+	}
+
 	/*
 	 * It can become very expensive to allocate transparent hugepages at
 	 * fault, so use asynchronous memory compaction for THP unless it is
 	 * khugepaged trying to collapse.
 	 */
-	if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD))
+	if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
+						(current->flags & PF_KTHREAD))
 		migration_mode = MIGRATE_SYNC_LIGHT;
 
-	/*
-	 * If compaction is deferred for high-order allocations, it is because
-	 * sync compaction recently failed. In this is the case and the caller
-	 * requested a movable allocation that does not heavily disrupt the
-	 * system then fail the allocation instead of entering direct reclaim.
-	 */
-	if ((deferred_compaction || contended_compaction) &&
-						(gfp_mask & __GFP_NO_KSWAPD))
-		goto nopage;
-
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -2702,8 +2761,7 @@ rebalance:
 					preferred_zone,
 					classzone_idx, migratetype,
 					migration_mode, &contended_compaction,
-					&deferred_compaction,
-					&did_some_progress);
+					&deferred_compaction);
 		if (page)
 			goto got_pg;
 	}
@@ -2729,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct zone *preferred_zone;
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
-	int migratetype = allocflags_to_migratetype(gfp_mask);
+	int migratetype = gfpflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 	int classzone_idx;
@@ -2751,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
+	if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+
 retry_cpuset:
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
@@ -2762,33 +2823,12 @@ retry_cpuset:
 		goto out;
 	classzone_idx = zonelist_zone_idx(preferred_zoneref);
 
-#ifdef CONFIG_CMA
-	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
-		alloc_flags |= ALLOC_CMA;
-#endif
-retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, classzone_idx, migratetype);
 	if (unlikely(!page)) {
 		/*
-		 * The first pass makes sure allocations are spread
-		 * fairly within the local node. However, the local
-		 * node might have free pages left after the fairness
-		 * batches are exhausted, and remote zones haven't
-		 * even been considered yet. Try once more without
-		 * fairness, and include remote zones now, before
-		 * entering the slowpath and waking kswapd: prefer
-		 * spilling to a remote zone over swapping locally.
-		 */
-		if (alloc_flags & ALLOC_FAIR) {
-			reset_alloc_batches(zonelist, high_zoneidx,
-					preferred_zone);
-			alloc_flags &= ~ALLOC_FAIR;
-			goto retry;
-		}
-		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not
 		 * complete.
@@ -2962,7 +3002,7 @@ EXPORT_SYMBOL(alloc_pages_exact);
 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
 * but is not exact.
 */
-void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 {
 	unsigned order = get_order(size);
 	struct page *p = alloc_pages_node(nid, gfp_mask, order);
@@ -2970,7 +3010,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 		return NULL;
 	return make_alloc_exact((unsigned long)page_address(p), order, size);
 }
-EXPORT_SYMBOL(alloc_pages_exact_nid);
 
 /**
  * free_pages_exact - release memory allocated via alloc_pages_exact()
@@ -3052,7 +3091,7 @@ static inline void show_node(struct zone *zone)
 void si_meminfo(struct sysinfo *val)
 {
 	val->totalram = totalram_pages;
-	val->sharedram = 0;
+	val->sharedram = global_page_state(NR_SHMEM);
 	val->freeram = global_page_state(NR_FREE_PAGES);
 	val->bufferram = nr_blockdev_pages();
 	val->totalhigh = totalhigh_pages;
@@ -3072,6 +3111,7 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
 		managed_pages += pgdat->node_zones[zone_type].managed_pages;
 	val->totalram = managed_pages;
+	val->sharedram = node_page_state(nid, NR_SHMEM);
 	val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
 	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3253,12 +3293,12 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_BOUNCE)),
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
-			zone->pages_scanned,
+			K(zone_page_state(zone, NR_PAGES_SCANNED)),
 			(!zone_reclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
-			printk(" %lu", zone->lowmem_reserve[i]);
+			printk(" %ld", zone->lowmem_reserve[i]);
 		printk("\n");
 	}
 
@@ -3572,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
 		zonelist->_zonerefs[pos].zone_idx = 0;
 }
 
+#if defined(CONFIG_64BIT)
+/*
+ * Devices that require DMA32/DMA are relatively rare and do not justify a
+ * penalty to every machine in case the specialised case applies. Default
+ * to Node-ordering on 64-bit NUMA machines
+ */
+static int default_zonelist_order(void)
+{
+	return ZONELIST_ORDER_NODE;
+}
+#else
+/*
+ * On 32-bit, the Normal zone needs to be preserved for allocations accessible
+ * by the kernel. If processes running on node 0 deplete the low memory zone
+ * then reclaim will occur more frequency increasing stalls and potentially
+ * be easier to OOM if a large percentage of the zone is under writeback or
+ * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
+ * Hence, default to zone ordering on 32-bit.
+ */
 static int default_zonelist_order(void)
 {
-	int nid, zone_type;
-	unsigned long low_kmem_size, total_size;
-	struct zone *z;
-	int average_size;
-	/*
-	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
-	 * If they are really small and used heavily, the system can fall
-	 * into OOM very easily.
-	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
-	 */
-	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
-	low_kmem_size = 0;
-	total_size = 0;
-	for_each_online_node(nid) {
-		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
-			z = &NODE_DATA(nid)->node_zones[zone_type];
-			if (populated_zone(z)) {
-				if (zone_type < ZONE_NORMAL)
-					low_kmem_size += z->managed_pages;
-				total_size += z->managed_pages;
-			} else if (zone_type == ZONE_NORMAL) {
-				/*
-				 * If any node has only lowmem, then node order
-				 * is preferred to allow kernel allocations
-				 * locally; otherwise, they can easily infringe
-				 * on other nodes when there is an abundance of
-				 * lowmem available to allocate from.
-				 */
-				return ZONELIST_ORDER_NODE;
-			}
-		}
-	}
-	if (!low_kmem_size ||		/* there are no DMA area. */
-	    low_kmem_size > total_size/2)	/* DMA/DMA32 is big. */
-		return ZONELIST_ORDER_NODE;
-	/*
-	 * look into each node's config.
-	 * If there is a node whose DMA/DMA32 memory is very big area on
-	 * local memory, NODE_ORDER may be suitable.
-	 */
-	average_size = total_size /
-			(nodes_weight(node_states[N_MEMORY]) + 1);
-	for_each_online_node(nid) {
-		low_kmem_size = 0;
-		total_size = 0;
-		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
-			z = &NODE_DATA(nid)->node_zones[zone_type];
-			if (populated_zone(z)) {
-				if (zone_type < ZONE_NORMAL)
-					low_kmem_size += z->present_pages;
-				total_size += z->present_pages;
-			}
-		}
-		if (low_kmem_size &&
-		    total_size > average_size && /* ignore small node */
-		    low_kmem_size > total_size * 70/100)
-			return ZONELIST_ORDER_NODE;
-	}
 	return ZONELIST_ORDER_ZONE;
 }
+#endif /* CONFIG_64BIT */
 
 static void set_zonelist_order(void)
 {
@@ -4969,6 +4971,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 	pgdat->node_start_pfn = node_start_pfn;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
+		(u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
 #endif
 	calculate_node_totalpages(pgdat, start_pfn, end_pfn,
 				  zones_size, zholes_size);
@@ -5579,7 +5583,7 @@ static void calculate_totalreserve_pages(void)
 	for_each_online_pgdat(pgdat) {
 		for (i = 0; i < MAX_NR_ZONES; i++) {
 			struct zone *zone = pgdat->node_zones + i;
-			unsigned long max = 0;
+			long max = 0;
 
 			/* Find valid and maximum lowmem_reserve in the zone */
 			for (j = i; j < MAX_NR_ZONES; j++) {
@@ -5694,9 +5698,8 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
 		__mod_zone_page_state(zone, NR_ALLOC_BATCH,
-				      high_wmark_pages(zone) -
-				      low_wmark_pages(zone) -
-				      zone_page_state(zone, NR_ALLOC_BATCH));
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
@@ -6271,8 +6274,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
 		if (list_empty(&cc->migratepages)) {
 			cc->nr_migratepages = 0;
-			pfn = isolate_migratepages_range(cc->zone, cc,
-							 pfn, end, true);
+			pfn = isolate_migratepages_range(cc, pfn, end);
 			if (!pfn) {
 				ret = -EINTR;
 				break;
@@ -6548,97 +6550,3 @@ bool is_free_buddy_page(struct page *page)
 	return order < MAX_ORDER;
 }
 #endif
-
-static const struct trace_print_flags pageflag_names[] = {
-	{1UL << PG_locked,		"locked"	},
-	{1UL << PG_error,		"error"		},
-	{1UL << PG_referenced,		"referenced"	},
-	{1UL << PG_uptodate,		"uptodate"	},
-	{1UL << PG_dirty,		"dirty"		},
-	{1UL << PG_lru,			"lru"		},
-	{1UL << PG_active,		"active"	},
-	{1UL << PG_slab,		"slab"		},
-	{1UL << PG_owner_priv_1,	"owner_priv_1"	},
-	{1UL << PG_arch_1,		"arch_1"	},
-	{1UL << PG_reserved,		"reserved"	},
-	{1UL << PG_private,		"private"	},
-	{1UL << PG_private_2,		"private_2"	},
-	{1UL << PG_writeback,		"writeback"	},
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
-	{1UL << PG_head,		"head"		},
-	{1UL << PG_tail,		"tail"		},
-#else
-	{1UL << PG_compound,		"compound"	},
-#endif
-	{1UL << PG_swapcache,		"swapcache"	},
-	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
-	{1UL << PG_reclaim,		"reclaim"	},
-	{1UL << PG_swapbacked,		"swapbacked"	},
-	{1UL << PG_unevictable,		"unevictable"	},
-#ifdef CONFIG_MMU
-	{1UL << PG_mlocked,		"mlocked"	},
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
-	{1UL << PG_uncached,		"uncached"	},
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
-	{1UL << PG_hwpoison,		"hwpoison"	},
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	{1UL << PG_compound_lock,	"compound_lock"	},
-#endif
-};
-
-static void dump_page_flags(unsigned long flags)
-{
-	const char *delim = "";
-	unsigned long mask;
-	int i;
-
-	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
-
-	printk(KERN_ALERT "page flags: %#lx(", flags);
-
-	/* remove zone id */
-	flags &= (1UL << NR_PAGEFLAGS) - 1;
-
-	for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
-
-		mask = pageflag_names[i].mask;
-		if ((flags & mask) != mask)
-			continue;
-
-		flags &= ~mask;
-		printk("%s%s", delim, pageflag_names[i].name);
-		delim = "|";
-	}
-
-	/* check for left over flags */
-	if (flags)
-		printk("%s%#lx", delim, flags);
-
-	printk(")\n");
-}
-
-void dump_page_badflags(struct page *page, const char *reason,
-		unsigned long badflags)
-{
-	printk(KERN_ALERT
-	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-		page, atomic_read(&page->_count), page_mapcount(page),
-		page->mapping, page->index);
-	dump_page_flags(page->flags);
-	if (reason)
-		pr_alert("page dumped because: %s\n", reason);
-	if (page->flags & badflags) {
-		pr_alert("bad because of flags:\n");
-		dump_page_flags(page->flags & badflags);
-	}
-	mem_cgroup_print_bad_page(page);
-}
-
-void dump_page(struct page *page, const char *reason)
-{
-	dump_page_badflags(page, reason, 0);
-}
-EXPORT_SYMBOL(dump_page);