path: root/mm/page_alloc.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 629
1 file changed, 433 insertions(+), 196 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..4e8985acdab8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/compiler.h> 25#include <linux/compiler.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
@@ -29,6 +30,7 @@
29#include <linux/pagevec.h> 30#include <linux/pagevec.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/ratelimit.h>
32#include <linux/oom.h> 34#include <linux/oom.h>
33#include <linux/notifier.h> 35#include <linux/notifier.h>
34#include <linux/topology.h> 36#include <linux/topology.h>
@@ -38,6 +40,7 @@
38#include <linux/memory_hotplug.h> 40#include <linux/memory_hotplug.h>
39#include <linux/nodemask.h> 41#include <linux/nodemask.h>
40#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
43#include <linux/vmstat.h>
41#include <linux/mempolicy.h> 44#include <linux/mempolicy.h>
42#include <linux/stop_machine.h> 45#include <linux/stop_machine.h>
43#include <linux/sort.h> 46#include <linux/sort.h>
@@ -52,6 +55,8 @@
52#include <linux/compaction.h> 55#include <linux/compaction.h>
53#include <trace/events/kmem.h> 56#include <trace/events/kmem.h>
54#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h>
59#include <linux/prefetch.h>
55 60
56#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
57#include <asm/div64.h> 62#include <asm/div64.h>
@@ -103,19 +108,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
103 * only be modified with pm_mutex held, unless the suspend/hibernate code is 108 * only be modified with pm_mutex held, unless the suspend/hibernate code is
104 * guaranteed not to run in parallel with that modification). 109 * guaranteed not to run in parallel with that modification).
105 */ 110 */
106void set_gfp_allowed_mask(gfp_t mask) 111
112static gfp_t saved_gfp_mask;
113
114void pm_restore_gfp_mask(void)
107{ 115{
108 WARN_ON(!mutex_is_locked(&pm_mutex)); 116 WARN_ON(!mutex_is_locked(&pm_mutex));
109 gfp_allowed_mask = mask; 117 if (saved_gfp_mask) {
118 gfp_allowed_mask = saved_gfp_mask;
119 saved_gfp_mask = 0;
120 }
110} 121}
111 122
112gfp_t clear_gfp_allowed_mask(gfp_t mask) 123void pm_restrict_gfp_mask(void)
113{ 124{
114 gfp_t ret = gfp_allowed_mask;
115
116 WARN_ON(!mutex_is_locked(&pm_mutex)); 125 WARN_ON(!mutex_is_locked(&pm_mutex));
117 gfp_allowed_mask &= ~mask; 126 WARN_ON(saved_gfp_mask);
118 return ret; 127 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS;
119} 129}
120#endif /* CONFIG_PM_SLEEP */ 130#endif /* CONFIG_PM_SLEEP */
121 131
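The hunk above replaces set_gfp_allowed_mask()/clear_gfp_allowed_mask() with a save/restore pair: pm_restrict_gfp_mask() stashes the current gfp_allowed_mask before clearing GFP_IOFS, and pm_restore_gfp_mask() puts it back only if something was saved. A minimal userspace sketch of that pattern follows; the mask values and the assert() standing in for WARN_ON() are illustrative, not the kernel definitions.

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-ins; the real values live in the kernel's gfp.h. */
#define GFP_IO   0x40u
#define GFP_FS   0x80u
#define GFP_IOFS (GFP_IO | GFP_FS)

static unsigned int gfp_allowed_mask = 0xffu;  /* everything allowed */
static unsigned int saved_gfp_mask;            /* 0 means "nothing saved" */

static void pm_restrict_gfp_mask_sketch(void)
{
	assert(saved_gfp_mask == 0);        /* mirrors WARN_ON(saved_gfp_mask) */
	saved_gfp_mask = gfp_allowed_mask;  /* remember the full mask */
	gfp_allowed_mask &= ~GFP_IOFS;      /* forbid I/O and FS reclaim */
}

static void pm_restore_gfp_mask_sketch(void)
{
	if (saved_gfp_mask) {               /* only restore if something was saved */
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

int main(void)
{
	pm_restrict_gfp_mask_sketch();
	printf("restricted: %#x\n", gfp_allowed_mask);  /* 0x3f */
	pm_restore_gfp_mask_sketch();
	printf("restored:   %#x\n", gfp_allowed_mask);  /* 0xff */
	return 0;
}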
@@ -280,7 +290,7 @@ static void bad_page(struct page *page)
280 290
281 /* Don't complain about poisoned pages */ 291 /* Don't complain about poisoned pages */
282 if (PageHWPoison(page)) { 292 if (PageHWPoison(page)) {
283 __ClearPageBuddy(page); 293 reset_page_mapcount(page); /* remove PageBuddy */
284 return; 294 return;
285 } 295 }
286 296
@@ -311,7 +321,7 @@ static void bad_page(struct page *page)
311 dump_stack(); 321 dump_stack();
312out: 322out:
313 /* Leave bad fields for debug, except PageBuddy could make trouble */ 323 /* Leave bad fields for debug, except PageBuddy could make trouble */
314 __ClearPageBuddy(page); 324 reset_page_mapcount(page); /* remove PageBuddy */
315 add_taint(TAINT_BAD_PAGE); 325 add_taint(TAINT_BAD_PAGE);
316} 326}
317 327
@@ -351,6 +361,7 @@ void prep_compound_page(struct page *page, unsigned long order)
351 } 361 }
352} 362}
353 363
364/* update __split_huge_page_refcount if you change this function */
354static int destroy_compound_page(struct page *page, unsigned long order) 365static int destroy_compound_page(struct page *page, unsigned long order)
355{ 366{
356 int i; 367 int i;
@@ -420,18 +431,10 @@ static inline void rmv_page_order(struct page *page)
420 * 431 *
421 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 432 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
422 */ 433 */
423static inline struct page *
424__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
425{
426 unsigned long buddy_idx = page_idx ^ (1 << order);
427
428 return page + (buddy_idx - page_idx);
429}
430
431static inline unsigned long 434static inline unsigned long
432__find_combined_index(unsigned long page_idx, unsigned int order) 435__find_buddy_index(unsigned long page_idx, unsigned int order)
433{ 436{
434 return (page_idx & ~(1 << order)); 437 return page_idx ^ (1 << order);
435} 438}
436 439
437/* 440/*
@@ -442,8 +445,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
442 * (c) a page and its buddy have the same order && 445 * (c) a page and its buddy have the same order &&
443 * (d) a page and its buddy are in the same zone. 446 * (d) a page and its buddy are in the same zone.
444 * 447 *
445 * For recording whether a page is in the buddy system, we use PG_buddy. 448 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
446 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 449 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
447 * 450 *
448 * For recording page's order, we use page_private(page). 451 * For recording page's order, we use page_private(page).
449 */ 452 */
@@ -476,7 +479,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
476 * as necessary, plus some accounting needed to play nicely with other 479 * as necessary, plus some accounting needed to play nicely with other
477 * parts of the VM system. 480 * parts of the VM system.
478 * At each level, we keep a list of pages, which are heads of continuous 481 * At each level, we keep a list of pages, which are heads of continuous
479 * free pages of length of (1 << order) and marked with PG_buddy. Page's 482 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
480 * order is recorded in page_private(page) field. 483 * order is recorded in page_private(page) field.
481 * So when we are allocating or freeing one, we can derive the state of the 484 * So when we are allocating or freeing one, we can derive the state of the
482 * other. That is, if we allocate a small block, and both were 485 * other. That is, if we allocate a small block, and both were
@@ -493,6 +496,7 @@ static inline void __free_one_page(struct page *page,
493{ 496{
494 unsigned long page_idx; 497 unsigned long page_idx;
495 unsigned long combined_idx; 498 unsigned long combined_idx;
499 unsigned long uninitialized_var(buddy_idx);
496 struct page *buddy; 500 struct page *buddy;
497 501
498 if (unlikely(PageCompound(page))) 502 if (unlikely(PageCompound(page)))
@@ -507,7 +511,8 @@ static inline void __free_one_page(struct page *page,
507 VM_BUG_ON(bad_range(zone, page)); 511 VM_BUG_ON(bad_range(zone, page));
508 512
509 while (order < MAX_ORDER-1) { 513 while (order < MAX_ORDER-1) {
510 buddy = __page_find_buddy(page, page_idx, order); 514 buddy_idx = __find_buddy_index(page_idx, order);
515 buddy = page + (buddy_idx - page_idx);
511 if (!page_is_buddy(page, buddy, order)) 516 if (!page_is_buddy(page, buddy, order))
512 break; 517 break;
513 518
@@ -515,7 +520,7 @@ static inline void __free_one_page(struct page *page,
515 list_del(&buddy->lru); 520 list_del(&buddy->lru);
516 zone->free_area[order].nr_free--; 521 zone->free_area[order].nr_free--;
517 rmv_page_order(buddy); 522 rmv_page_order(buddy);
518 combined_idx = __find_combined_index(page_idx, order); 523 combined_idx = buddy_idx & page_idx;
519 page = page + (combined_idx - page_idx); 524 page = page + (combined_idx - page_idx);
520 page_idx = combined_idx; 525 page_idx = combined_idx;
521 order++; 526 order++;
@@ -530,11 +535,12 @@ static inline void __free_one_page(struct page *page,
530 * so it's less likely to be used soon and more likely to be merged 535 * so it's less likely to be used soon and more likely to be merged
531 * as a higher order page 536 * as a higher order page
532 */ 537 */
533 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { 538 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
534 struct page *higher_page, *higher_buddy; 539 struct page *higher_page, *higher_buddy;
535 combined_idx = __find_combined_index(page_idx, order); 540 combined_idx = buddy_idx & page_idx;
536 higher_page = page + combined_idx - page_idx; 541 higher_page = page + (combined_idx - page_idx);
537 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 542 buddy_idx = __find_buddy_index(combined_idx, order + 1);
543 higher_buddy = page + (buddy_idx - combined_idx);
538 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 544 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
539 list_add_tail(&page->lru, 545 list_add_tail(&page->lru,
540 &zone->free_area[order].free_list[migratetype]); 546 &zone->free_area[order].free_list[migratetype]);
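The buddy-merging hunks above drop __page_find_buddy() in favour of open-coded index math: the buddy of the block at page_idx for a given order is page_idx ^ (1 << order), and the merged block starts at buddy_idx & page_idx (the same value the old __find_combined_index() computed as page_idx & ~(1 << order)). A small standalone sketch of just that arithmetic, with an arbitrary example index:

#include <stdio.h>

/* Buddy index of the block starting at page_idx, for a given order. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;   /* block of 2^2 = 4 pages at index 12 */
	unsigned int order = 2;

	unsigned long buddy_idx = find_buddy_index(page_idx, order);  /* 8 */
	unsigned long combined_idx = buddy_idx & page_idx;            /* 8 */

	/* combined_idx is also page_idx & ~(1 << order), the expression the
	 * removed __find_combined_index() used. */
	printf("buddy of %lu (order %u) is %lu, merged block starts at %lu\n",
	       page_idx, order, buddy_idx, combined_idx);
	return 0;
}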
@@ -563,7 +569,8 @@ static inline int free_pages_check(struct page *page)
563 if (unlikely(page_mapcount(page) | 569 if (unlikely(page_mapcount(page) |
564 (page->mapping != NULL) | 570 (page->mapping != NULL) |
565 (atomic_read(&page->_count) != 0) | 571 (atomic_read(&page->_count) != 0) |
566 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 572 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
573 (mem_cgroup_bad_page_check(page)))) {
567 bad_page(page); 574 bad_page(page);
568 return 1; 575 return 1;
569 } 576 }
@@ -612,6 +619,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
612 list = &pcp->lists[migratetype]; 619 list = &pcp->lists[migratetype];
613 } while (list_empty(list)); 620 } while (list_empty(list));
614 621
622 /* This is the only non-empty list. Free them all. */
623 if (batch_free == MIGRATE_PCPTYPES)
624 batch_free = to_free;
625
615 do { 626 do {
616 page = list_entry(list->prev, struct page, lru); 627 page = list_entry(list->prev, struct page, lru);
617 /* must delete as __free_one_page list manipulates */ 628 /* must delete as __free_one_page list manipulates */
@@ -645,13 +656,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
645 trace_mm_page_free_direct(page, order); 656 trace_mm_page_free_direct(page, order);
646 kmemcheck_free_shadow(page, order); 657 kmemcheck_free_shadow(page, order);
647 658
648 for (i = 0; i < (1 << order); i++) { 659 if (PageAnon(page))
649 struct page *pg = page + i; 660 page->mapping = NULL;
650 661 for (i = 0; i < (1 << order); i++)
651 if (PageAnon(pg)) 662 bad += free_pages_check(page + i);
652 pg->mapping = NULL;
653 bad += free_pages_check(pg);
654 }
655 if (bad) 663 if (bad)
656 return false; 664 return false;
657 665
@@ -751,7 +759,8 @@ static inline int check_new_page(struct page *page)
751 if (unlikely(page_mapcount(page) | 759 if (unlikely(page_mapcount(page) |
752 (page->mapping != NULL) | 760 (page->mapping != NULL) |
753 (atomic_read(&page->_count) != 0) | 761 (atomic_read(&page->_count) != 0) |
754 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 762 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
763 (mem_cgroup_bad_page_check(page)))) {
755 bad_page(page); 764 bad_page(page);
756 return 1; 765 return 1;
757 } 766 }
@@ -864,9 +873,8 @@ static int move_freepages(struct zone *zone,
864 } 873 }
865 874
866 order = page_order(page); 875 order = page_order(page);
867 list_del(&page->lru); 876 list_move(&page->lru,
868 list_add(&page->lru, 877 &zone->free_area[order].free_list[migratetype]);
869 &zone->free_area[order].free_list[migratetype]);
870 page += 1 << order; 878 page += 1 << order;
871 pages_moved += 1 << order; 879 pages_moved += 1 << order;
872 } 880 }
@@ -937,7 +945,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
937 * If breaking a large block of pages, move all free 945 * If breaking a large block of pages, move all free
938 * pages to the preferred allocation list. If falling 946 * pages to the preferred allocation list. If falling
939 * back for a reclaimable kernel allocation, be more 947 * back for a reclaimable kernel allocation, be more
940 * agressive about taking ownership of free pages 948 * aggressive about taking ownership of free pages
941 */ 949 */
942 if (unlikely(current_order >= (pageblock_order >> 1)) || 950 if (unlikely(current_order >= (pageblock_order >> 1)) ||
943 start_migratetype == MIGRATE_RECLAIMABLE || 951 start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1089,8 +1097,10 @@ static void drain_pages(unsigned int cpu)
1089 pset = per_cpu_ptr(zone->pageset, cpu); 1097 pset = per_cpu_ptr(zone->pageset, cpu);
1090 1098
1091 pcp = &pset->pcp; 1099 pcp = &pset->pcp;
1092 free_pcppages_bulk(zone, pcp->count, pcp); 1100 if (pcp->count) {
1093 pcp->count = 0; 1101 free_pcppages_bulk(zone, pcp->count, pcp);
1102 pcp->count = 0;
1103 }
1094 local_irq_restore(flags); 1104 local_irq_restore(flags);
1095 } 1105 }
1096} 1106}
@@ -1332,7 +1342,7 @@ again:
1332 } 1342 }
1333 1343
1334 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1344 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1335 zone_statistics(preferred_zone, zone); 1345 zone_statistics(preferred_zone, zone, gfp_flags);
1336 local_irq_restore(flags); 1346 local_irq_restore(flags);
1337 1347
1338 VM_BUG_ON(bad_range(zone, page)); 1348 VM_BUG_ON(bad_range(zone, page));
@@ -1454,24 +1464,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1454#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1464#endif /* CONFIG_FAIL_PAGE_ALLOC */
1455 1465
1456/* 1466/*
1457 * Return 1 if free pages are above 'mark'. This takes into account the order 1467 * Return true if free pages are above 'mark'. This takes into account the order
1458 * of the allocation. 1468 * of the allocation.
1459 */ 1469 */
1460int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1470static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1461 int classzone_idx, int alloc_flags) 1471 int classzone_idx, int alloc_flags, long free_pages)
1462{ 1472{
1463 /* free_pages my go negative - that's OK */ 1473 /* free_pages my go negative - that's OK */
1464 long min = mark; 1474 long min = mark;
1465 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1466 int o; 1475 int o;
1467 1476
1477 free_pages -= (1 << order) + 1;
1468 if (alloc_flags & ALLOC_HIGH) 1478 if (alloc_flags & ALLOC_HIGH)
1469 min -= min / 2; 1479 min -= min / 2;
1470 if (alloc_flags & ALLOC_HARDER) 1480 if (alloc_flags & ALLOC_HARDER)
1471 min -= min / 4; 1481 min -= min / 4;
1472 1482
1473 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1483 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1474 return 0; 1484 return false;
1475 for (o = 0; o < order; o++) { 1485 for (o = 0; o < order; o++) {
1476 /* At the next order, this order's pages become unavailable */ 1486 /* At the next order, this order's pages become unavailable */
1477 free_pages -= z->free_area[o].nr_free << o; 1487 free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1490,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1480 min >>= 1; 1490 min >>= 1;
1481 1491
1482 if (free_pages <= min) 1492 if (free_pages <= min)
1483 return 0; 1493 return false;
1484 } 1494 }
1485 return 1; 1495 return true;
1496}
1497
1498bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1499 int classzone_idx, int alloc_flags)
1500{
1501 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1502 zone_page_state(z, NR_FREE_PAGES));
1503}
1504
1505bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1506 int classzone_idx, int alloc_flags)
1507{
1508 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1509
1510 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1511 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1512
1513 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1514 free_pages);
1486} 1515}
1487 1516
1488#ifdef CONFIG_NUMA 1517#ifdef CONFIG_NUMA
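The watermark rework above moves the core check into __zone_watermark_ok(), which takes the free-page count as a parameter so that zone_watermark_ok_safe() can substitute a drift-corrected snapshot when the per-cpu counters may be stale. A simplified standalone sketch of the core loop follows; struct zone_sketch and the numbers in main() are made up for illustration and are not the kernel structures.

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

/* Simplified zone: only the fields the check needs. */
struct zone_sketch {
	long lowmem_reserve;              /* reserve for this classzone */
	unsigned long nr_free[MAX_ORDER]; /* free blocks per order */
};

/* Core of __zone_watermark_ok(): free_pages is passed in by the caller. */
static bool watermark_ok(struct zone_sketch *z, int order,
			 unsigned long mark, long free_pages)
{
	long min = mark;
	int o;

	free_pages -= (1 << order) + 1;   /* same adjustment as the patch */
	if (free_pages <= min + z->lowmem_reserve)
		return false;

	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable. */
		free_pages -= z->nr_free[o] << o;
		min >>= 1;                /* require less at each higher order */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	struct zone_sketch z = { .lowmem_reserve = 32,
				 .nr_free = { 64, 32, 16, 8, 4, 2, 1 } };

	printf("order-0 ok: %d\n", watermark_ok(&z, 0, 128, 512));  /* 1 */
	printf("order-3 ok: %d\n", watermark_ok(&z, 3, 128, 200));  /* 0 */
	return 0;
}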
@@ -1694,6 +1723,59 @@ try_next_zone:
1694 return page; 1723 return page;
1695} 1724}
1696 1725
1726/*
1727 * Large machines with many possible nodes should not always dump per-node
1728 * meminfo in irq context.
1729 */
1730static inline bool should_suppress_show_mem(void)
1731{
1732 bool ret = false;
1733
1734#if NODES_SHIFT > 8
1735 ret = in_interrupt();
1736#endif
1737 return ret;
1738}
1739
1740static DEFINE_RATELIMIT_STATE(nopage_rs,
1741 DEFAULT_RATELIMIT_INTERVAL,
1742 DEFAULT_RATELIMIT_BURST);
1743
1744void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1745{
1746 va_list args;
1747 unsigned int filter = SHOW_MEM_FILTER_NODES;
1748
1749 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
1750 return;
1751
1752 /*
1753 * This documents exceptions given to allocations in certain
1754 * contexts that are allowed to allocate outside current's set
1755 * of allowed nodes.
1756 */
1757 if (!(gfp_mask & __GFP_NOMEMALLOC))
1758 if (test_thread_flag(TIF_MEMDIE) ||
1759 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1760 filter &= ~SHOW_MEM_FILTER_NODES;
1761 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1762 filter &= ~SHOW_MEM_FILTER_NODES;
1763
1764 if (fmt) {
1765 printk(KERN_WARNING);
1766 va_start(args, fmt);
1767 vprintk(fmt, args);
1768 va_end(args);
1769 }
1770
1771 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
1772 current->comm, order, gfp_mask);
1773
1774 dump_stack();
1775 if (!should_suppress_show_mem())
1776 show_mem(filter);
1777}
1778
1697static inline int 1779static inline int
1698should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1780should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1699 unsigned long pages_reclaimed) 1781 unsigned long pages_reclaimed)
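warn_alloc_failed() above rate-limits allocation-failure reports with a DEFINE_RATELIMIT_STATE() of DEFAULT_RATELIMIT_INTERVAL/BURST. A rough userspace stand-in for that interval-plus-burst limiter, using second-granularity time and illustrative window values:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Simplified stand-in for the kernel's ratelimit state: allow at most
 * 'burst' messages per 'interval' seconds. */
struct ratelimit_sketch {
	time_t interval;   /* window length in seconds */
	int burst;         /* messages allowed per window */
	time_t begin;      /* start of the current window */
	int printed;       /* messages emitted in the current window */
};

static bool ratelimit_ok(struct ratelimit_sketch *rs)
{
	time_t now = time(NULL);

	if (rs->begin == 0 || now - rs->begin >= rs->interval) {
		rs->begin = now;      /* open a new window */
		rs->printed = 0;
	}
	if (rs->printed < rs->burst) {
		rs->printed++;
		return true;          /* caller may print */
	}
	return false;                 /* suppress this one */
}

int main(void)
{
	struct ratelimit_sketch nopage_rs = { .interval = 5, .burst = 10 };

	for (int i = 0; i < 20; i++)
		if (ratelimit_ok(&nopage_rs))
			fprintf(stderr, "page allocation failure: order:%d\n", i % 4);
	return 0;
}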
@@ -1787,15 +1869,18 @@ static struct page *
1787__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1869__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1788 struct zonelist *zonelist, enum zone_type high_zoneidx, 1870 struct zonelist *zonelist, enum zone_type high_zoneidx,
1789 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1871 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1790 int migratetype, unsigned long *did_some_progress) 1872 int migratetype, unsigned long *did_some_progress,
1873 bool sync_migration)
1791{ 1874{
1792 struct page *page; 1875 struct page *page;
1793 1876
1794 if (!order || compaction_deferred(preferred_zone)) 1877 if (!order || compaction_deferred(preferred_zone))
1795 return NULL; 1878 return NULL;
1796 1879
1880 current->flags |= PF_MEMALLOC;
1797 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1881 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1798 nodemask); 1882 nodemask, sync_migration);
1883 current->flags &= ~PF_MEMALLOC;
1799 if (*did_some_progress != COMPACT_SKIPPED) { 1884 if (*did_some_progress != COMPACT_SKIPPED) {
1800 1885
1801 /* Page migration frees to the PCP lists but we want merging */ 1886 /* Page migration frees to the PCP lists but we want merging */
@@ -1831,7 +1916,8 @@ static inline struct page *
1831__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1916__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1832 struct zonelist *zonelist, enum zone_type high_zoneidx, 1917 struct zonelist *zonelist, enum zone_type high_zoneidx,
1833 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1918 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1834 int migratetype, unsigned long *did_some_progress) 1919 int migratetype, unsigned long *did_some_progress,
1920 bool sync_migration)
1835{ 1921{
1836 return NULL; 1922 return NULL;
1837} 1923}
@@ -1846,23 +1932,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1846{ 1932{
1847 struct page *page = NULL; 1933 struct page *page = NULL;
1848 struct reclaim_state reclaim_state; 1934 struct reclaim_state reclaim_state;
1849 struct task_struct *p = current;
1850 bool drained = false; 1935 bool drained = false;
1851 1936
1852 cond_resched(); 1937 cond_resched();
1853 1938
1854 /* We now go into synchronous reclaim */ 1939 /* We now go into synchronous reclaim */
1855 cpuset_memory_pressure_bump(); 1940 cpuset_memory_pressure_bump();
1856 p->flags |= PF_MEMALLOC; 1941 current->flags |= PF_MEMALLOC;
1857 lockdep_set_current_reclaim_state(gfp_mask); 1942 lockdep_set_current_reclaim_state(gfp_mask);
1858 reclaim_state.reclaimed_slab = 0; 1943 reclaim_state.reclaimed_slab = 0;
1859 p->reclaim_state = &reclaim_state; 1944 current->reclaim_state = &reclaim_state;
1860 1945
1861 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1946 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1862 1947
1863 p->reclaim_state = NULL; 1948 current->reclaim_state = NULL;
1864 lockdep_clear_current_reclaim_state(); 1949 lockdep_clear_current_reclaim_state();
1865 p->flags &= ~PF_MEMALLOC; 1950 current->flags &= ~PF_MEMALLOC;
1866 1951
1867 cond_resched(); 1952 cond_resched();
1868 1953
@@ -1906,7 +1991,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1906 preferred_zone, migratetype); 1991 preferred_zone, migratetype);
1907 1992
1908 if (!page && gfp_mask & __GFP_NOFAIL) 1993 if (!page && gfp_mask & __GFP_NOFAIL)
1909 congestion_wait(BLK_RW_ASYNC, HZ/50); 1994 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1910 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1995 } while (!page && (gfp_mask & __GFP_NOFAIL));
1911 1996
1912 return page; 1997 return page;
@@ -1914,24 +1999,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1914 1999
1915static inline 2000static inline
1916void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2001void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1917 enum zone_type high_zoneidx) 2002 enum zone_type high_zoneidx,
2003 enum zone_type classzone_idx)
1918{ 2004{
1919 struct zoneref *z; 2005 struct zoneref *z;
1920 struct zone *zone; 2006 struct zone *zone;
1921 2007
1922 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2008 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1923 wakeup_kswapd(zone, order); 2009 wakeup_kswapd(zone, order, classzone_idx);
1924} 2010}
1925 2011
1926static inline int 2012static inline int
1927gfp_to_alloc_flags(gfp_t gfp_mask) 2013gfp_to_alloc_flags(gfp_t gfp_mask)
1928{ 2014{
1929 struct task_struct *p = current;
1930 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2015 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1931 const gfp_t wait = gfp_mask & __GFP_WAIT; 2016 const gfp_t wait = gfp_mask & __GFP_WAIT;
1932 2017
1933 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2018 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1934 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); 2019 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
1935 2020
1936 /* 2021 /*
1937 * The caller may dip into page reserves a bit more if the caller 2022 * The caller may dip into page reserves a bit more if the caller
@@ -1939,21 +2024,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1939 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2024 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1940 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2025 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1941 */ 2026 */
1942 alloc_flags |= (gfp_mask & __GFP_HIGH); 2027 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1943 2028
1944 if (!wait) { 2029 if (!wait) {
1945 alloc_flags |= ALLOC_HARDER; 2030 /*
2031 * Not worth trying to allocate harder for
2032 * __GFP_NOMEMALLOC even if it can't schedule.
2033 */
2034 if (!(gfp_mask & __GFP_NOMEMALLOC))
2035 alloc_flags |= ALLOC_HARDER;
1946 /* 2036 /*
1947 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2037 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1948 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2038 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1949 */ 2039 */
1950 alloc_flags &= ~ALLOC_CPUSET; 2040 alloc_flags &= ~ALLOC_CPUSET;
1951 } else if (unlikely(rt_task(p)) && !in_interrupt()) 2041 } else if (unlikely(rt_task(current)) && !in_interrupt())
1952 alloc_flags |= ALLOC_HARDER; 2042 alloc_flags |= ALLOC_HARDER;
1953 2043
1954 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2044 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1955 if (!in_interrupt() && 2045 if (!in_interrupt() &&
1956 ((p->flags & PF_MEMALLOC) || 2046 ((current->flags & PF_MEMALLOC) ||
1957 unlikely(test_thread_flag(TIF_MEMDIE)))) 2047 unlikely(test_thread_flag(TIF_MEMDIE))))
1958 alloc_flags |= ALLOC_NO_WATERMARKS; 2048 alloc_flags |= ALLOC_NO_WATERMARKS;
1959 } 2049 }
@@ -1972,7 +2062,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1972 int alloc_flags; 2062 int alloc_flags;
1973 unsigned long pages_reclaimed = 0; 2063 unsigned long pages_reclaimed = 0;
1974 unsigned long did_some_progress; 2064 unsigned long did_some_progress;
1975 struct task_struct *p = current; 2065 bool sync_migration = false;
1976 2066
1977 /* 2067 /*
1978 * In the slowpath, we sanity check order to avoid ever trying to 2068 * In the slowpath, we sanity check order to avoid ever trying to
@@ -1997,7 +2087,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1997 goto nopage; 2087 goto nopage;
1998 2088
1999restart: 2089restart:
2000 wake_all_kswapd(order, zonelist, high_zoneidx); 2090 if (!(gfp_mask & __GFP_NO_KSWAPD))
2091 wake_all_kswapd(order, zonelist, high_zoneidx,
2092 zone_idx(preferred_zone));
2001 2093
2002 /* 2094 /*
2003 * OK, we're below the kswapd watermark and have kicked background 2095 * OK, we're below the kswapd watermark and have kicked background
@@ -2006,6 +2098,15 @@ restart:
2006 */ 2098 */
2007 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2099 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2008 2100
2101 /*
2102 * Find the true preferred zone if the allocation is unconstrained by
2103 * cpusets.
2104 */
2105 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2106 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2107 &preferred_zone);
2108
2109rebalance:
2009 /* This is the last chance, in general, before the goto nopage. */ 2110 /* This is the last chance, in general, before the goto nopage. */
2010 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2111 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2011 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2112 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2013,7 +2114,6 @@ restart:
2013 if (page) 2114 if (page)
2014 goto got_pg; 2115 goto got_pg;
2015 2116
2016rebalance:
2017 /* Allocate without watermarks if the context allows */ 2117 /* Allocate without watermarks if the context allows */
2018 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2118 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2019 page = __alloc_pages_high_priority(gfp_mask, order, 2119 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2028,21 +2128,26 @@ rebalance:
2028 goto nopage; 2128 goto nopage;
2029 2129
2030 /* Avoid recursion of direct reclaim */ 2130 /* Avoid recursion of direct reclaim */
2031 if (p->flags & PF_MEMALLOC) 2131 if (current->flags & PF_MEMALLOC)
2032 goto nopage; 2132 goto nopage;
2033 2133
2034 /* Avoid allocations with no watermarks from looping endlessly */ 2134 /* Avoid allocations with no watermarks from looping endlessly */
2035 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2135 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2036 goto nopage; 2136 goto nopage;
2037 2137
2038 /* Try direct compaction */ 2138 /*
2139 * Try direct compaction. The first pass is asynchronous. Subsequent
2140 * attempts after direct reclaim are synchronous
2141 */
2039 page = __alloc_pages_direct_compact(gfp_mask, order, 2142 page = __alloc_pages_direct_compact(gfp_mask, order,
2040 zonelist, high_zoneidx, 2143 zonelist, high_zoneidx,
2041 nodemask, 2144 nodemask,
2042 alloc_flags, preferred_zone, 2145 alloc_flags, preferred_zone,
2043 migratetype, &did_some_progress); 2146 migratetype, &did_some_progress,
2147 sync_migration);
2044 if (page) 2148 if (page)
2045 goto got_pg; 2149 goto got_pg;
2150 sync_migration = true;
2046 2151
2047 /* Try direct reclaim and then allocating */ 2152 /* Try direct reclaim and then allocating */
2048 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2153 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2094,18 +2199,26 @@ rebalance:
2094 pages_reclaimed += did_some_progress; 2199 pages_reclaimed += did_some_progress;
2095 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2200 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2096 /* Wait for some write requests to complete then retry */ 2201 /* Wait for some write requests to complete then retry */
2097 congestion_wait(BLK_RW_ASYNC, HZ/50); 2202 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2098 goto rebalance; 2203 goto rebalance;
2204 } else {
2205 /*
2206 * High-order allocations do not necessarily loop after
2207 * direct reclaim and reclaim/compaction depends on compaction
2208 * being called after reclaim so call directly if necessary
2209 */
2210 page = __alloc_pages_direct_compact(gfp_mask, order,
2211 zonelist, high_zoneidx,
2212 nodemask,
2213 alloc_flags, preferred_zone,
2214 migratetype, &did_some_progress,
2215 sync_migration);
2216 if (page)
2217 goto got_pg;
2099 } 2218 }
2100 2219
2101nopage: 2220nopage:
2102 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2221 warn_alloc_failed(gfp_mask, order, NULL);
2103 printk(KERN_WARNING "%s: page allocation failure."
2104 " order:%d, mode:0x%x\n",
2105 p->comm, order, gfp_mask);
2106 dump_stack();
2107 show_mem();
2108 }
2109 return page; 2222 return page;
2110got_pg: 2223got_pg:
2111 if (kmemcheck_enabled) 2224 if (kmemcheck_enabled)
@@ -2145,7 +2258,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2145 2258
2146 get_mems_allowed(); 2259 get_mems_allowed();
2147 /* The preferred zone is used for statistics later */ 2260 /* The preferred zone is used for statistics later */
2148 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2261 first_zones_zonelist(zonelist, high_zoneidx,
2262 nodemask ? : &cpuset_current_mems_allowed,
2263 &preferred_zone);
2149 if (!preferred_zone) { 2264 if (!preferred_zone) {
2150 put_mems_allowed(); 2265 put_mems_allowed();
2151 return NULL; 2266 return NULL;
@@ -2224,6 +2339,21 @@ void free_pages(unsigned long addr, unsigned int order)
2224 2339
2225EXPORT_SYMBOL(free_pages); 2340EXPORT_SYMBOL(free_pages);
2226 2341
2342static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2343{
2344 if (addr) {
2345 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2346 unsigned long used = addr + PAGE_ALIGN(size);
2347
2348 split_page(virt_to_page((void *)addr), order);
2349 while (used < alloc_end) {
2350 free_page(used);
2351 used += PAGE_SIZE;
2352 }
2353 }
2354 return (void *)addr;
2355}
2356
2227/** 2357/**
2228 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 2358 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
2229 * @size: the number of bytes to allocate 2359 * @size: the number of bytes to allocate
@@ -2243,22 +2373,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2243 unsigned long addr; 2373 unsigned long addr;
2244 2374
2245 addr = __get_free_pages(gfp_mask, order); 2375 addr = __get_free_pages(gfp_mask, order);
2246 if (addr) { 2376 return make_alloc_exact(addr, order, size);
2247 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2248 unsigned long used = addr + PAGE_ALIGN(size);
2249
2250 split_page(virt_to_page((void *)addr), order);
2251 while (used < alloc_end) {
2252 free_page(used);
2253 used += PAGE_SIZE;
2254 }
2255 }
2256
2257 return (void *)addr;
2258} 2377}
2259EXPORT_SYMBOL(alloc_pages_exact); 2378EXPORT_SYMBOL(alloc_pages_exact);
2260 2379
2261/** 2380/**
2381 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2382 * pages on a node.
2383 * @nid: the preferred node ID where memory should be allocated
2384 * @size: the number of bytes to allocate
2385 * @gfp_mask: GFP flags for the allocation
2386 *
2387 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2388 * back.
2389 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2390 * but is not exact.
2391 */
2392void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2393{
2394 unsigned order = get_order(size);
2395 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2396 if (!p)
2397 return NULL;
2398 return make_alloc_exact((unsigned long)page_address(p), order, size);
2399}
2400EXPORT_SYMBOL(alloc_pages_exact_nid);
2401
2402/**
2262 * free_pages_exact - release memory allocated via alloc_pages_exact() 2403 * free_pages_exact - release memory allocated via alloc_pages_exact()
2263 * @virt: the value returned by alloc_pages_exact. 2404 * @virt: the value returned by alloc_pages_exact.
2264 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2405 * @size: size of allocation, same value as passed to alloc_pages_exact().
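make_alloc_exact() above captures the alloc_pages_exact() trick: allocate the next power-of-two order, split it, and free the tail pages past PAGE_ALIGN(size). The sketch below works through only the size arithmetic in userspace, assuming 4 KiB pages; get_order_sketch() mimics the kernel's get_order() contract but is not that function.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)               /* assume 4 KiB pages */
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Smallest order such that 2^order pages cover 'size' bytes. */
static unsigned int get_order_sketch(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long size = 5 * PAGE_SIZE + 123;        /* wants 6 pages */
	unsigned int order = get_order_sketch(size);     /* order 3 = 8 pages */
	unsigned long used_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	unsigned long freed_tail = (1UL << order) - used_pages;

	/* alloc_pages_exact() allocates 2^order pages, splits them, then
	 * frees the 'freed_tail' pages beyond PAGE_ALIGN(size). */
	printf("order %u (%lu pages), keep %lu, free %lu tail pages\n",
	       order, 1UL << order, used_pages, freed_tail);
	return 0;
}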
@@ -2352,19 +2493,41 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2352} 2493}
2353#endif 2494#endif
2354 2495
2496/*
2497 * Determine whether the node should be displayed or not, depending on whether
2498 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2499 */
2500bool skip_free_areas_node(unsigned int flags, int nid)
2501{
2502 bool ret = false;
2503
2504 if (!(flags & SHOW_MEM_FILTER_NODES))
2505 goto out;
2506
2507 get_mems_allowed();
2508 ret = !node_isset(nid, cpuset_current_mems_allowed);
2509 put_mems_allowed();
2510out:
2511 return ret;
2512}
2513
2355#define K(x) ((x) << (PAGE_SHIFT-10)) 2514#define K(x) ((x) << (PAGE_SHIFT-10))
2356 2515
2357/* 2516/*
2358 * Show free area list (used inside shift_scroll-lock stuff) 2517 * Show free area list (used inside shift_scroll-lock stuff)
2359 * We also calculate the percentage fragmentation. We do this by counting the 2518 * We also calculate the percentage fragmentation. We do this by counting the
2360 * memory on each free list with the exception of the first item on the list. 2519 * memory on each free list with the exception of the first item on the list.
2520 * Suppresses nodes that are not allowed by current's cpuset if
2521 * SHOW_MEM_FILTER_NODES is passed.
2361 */ 2522 */
2362void show_free_areas(void) 2523void show_free_areas(unsigned int filter)
2363{ 2524{
2364 int cpu; 2525 int cpu;
2365 struct zone *zone; 2526 struct zone *zone;
2366 2527
2367 for_each_populated_zone(zone) { 2528 for_each_populated_zone(zone) {
2529 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2530 continue;
2368 show_node(zone); 2531 show_node(zone);
2369 printk("%s per-cpu:\n", zone->name); 2532 printk("%s per-cpu:\n", zone->name);
2370 2533
@@ -2406,6 +2569,8 @@ void show_free_areas(void)
2406 for_each_populated_zone(zone) { 2569 for_each_populated_zone(zone) {
2407 int i; 2570 int i;
2408 2571
2572 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2573 continue;
2409 show_node(zone); 2574 show_node(zone);
2410 printk("%s" 2575 printk("%s"
2411 " free:%lukB" 2576 " free:%lukB"
@@ -2436,7 +2601,7 @@ void show_free_areas(void)
2436 " all_unreclaimable? %s" 2601 " all_unreclaimable? %s"
2437 "\n", 2602 "\n",
2438 zone->name, 2603 zone->name,
2439 K(zone_nr_free_pages(zone)), 2604 K(zone_page_state(zone, NR_FREE_PAGES)),
2440 K(min_wmark_pages(zone)), 2605 K(min_wmark_pages(zone)),
2441 K(low_wmark_pages(zone)), 2606 K(low_wmark_pages(zone)),
2442 K(high_wmark_pages(zone)), 2607 K(high_wmark_pages(zone)),
@@ -2473,6 +2638,8 @@ void show_free_areas(void)
2473 for_each_populated_zone(zone) { 2638 for_each_populated_zone(zone) {
2474 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2639 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2475 2640
2641 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2642 continue;
2476 show_node(zone); 2643 show_node(zone);
2477 printk("%s: ", zone->name); 2644 printk("%s: ", zone->name);
2478 2645
@@ -2579,9 +2746,16 @@ static int __parse_numa_zonelist_order(char *s)
2579 2746
2580static __init int setup_numa_zonelist_order(char *s) 2747static __init int setup_numa_zonelist_order(char *s)
2581{ 2748{
2582 if (s) 2749 int ret;
2583 return __parse_numa_zonelist_order(s); 2750
2584 return 0; 2751 if (!s)
2752 return 0;
2753
2754 ret = __parse_numa_zonelist_order(s);
2755 if (ret == 0)
2756 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2757
2758 return ret;
2585} 2759}
2586early_param("numa_zonelist_order", setup_numa_zonelist_order); 2760early_param("numa_zonelist_order", setup_numa_zonelist_order);
2587 2761
@@ -3007,14 +3181,6 @@ static __init_refok int __build_all_zonelists(void *data)
3007 build_zonelist_cache(pgdat); 3181 build_zonelist_cache(pgdat);
3008 } 3182 }
3009 3183
3010#ifdef CONFIG_MEMORY_HOTPLUG
3011 /* Setup real pagesets for the new zone */
3012 if (data) {
3013 struct zone *zone = data;
3014 setup_zone_pageset(zone);
3015 }
3016#endif
3017
3018 /* 3184 /*
3019 * Initialize the boot_pagesets that are going to be used 3185 * Initialize the boot_pagesets that are going to be used
3020 * for bootstrapping processors. The real pagesets for 3186 * for bootstrapping processors. The real pagesets for
@@ -3052,7 +3218,7 @@ static __init_refok int __build_all_zonelists(void *data)
3052 * Called with zonelists_mutex held always 3218 * Called with zonelists_mutex held always
3053 * unless system_state == SYSTEM_BOOTING. 3219 * unless system_state == SYSTEM_BOOTING.
3054 */ 3220 */
3055void build_all_zonelists(void *data) 3221void __ref build_all_zonelists(void *data)
3056{ 3222{
3057 set_zonelist_order(); 3223 set_zonelist_order();
3058 3224
@@ -3063,7 +3229,11 @@ void build_all_zonelists(void *data)
3063 } else { 3229 } else {
3064 /* we have to stop all cpus to guarantee there is no user 3230 /* we have to stop all cpus to guarantee there is no user
3065 of zonelist */ 3231 of zonelist */
3066 stop_machine(__build_all_zonelists, data, NULL); 3232#ifdef CONFIG_MEMORY_HOTPLUG
3233 if (data)
3234 setup_zone_pageset((struct zone *)data);
3235#endif
3236 stop_machine(__build_all_zonelists, NULL, NULL);
3067 /* cpuset refresh routine should be here */ 3237 /* cpuset refresh routine should be here */
3068 } 3238 }
3069 vm_total_pages = nr_free_pagecache_pages(); 3239 vm_total_pages = nr_free_pagecache_pages();
@@ -3159,6 +3329,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
3159#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3329#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3160 3330
3161/* 3331/*
3332 * Check if a pageblock contains reserved pages
3333 */
3334static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3335{
3336 unsigned long pfn;
3337
3338 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3339 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3340 return 1;
3341 }
3342 return 0;
3343}
3344
3345/*
3162 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3346 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3163 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3347 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3164 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3348 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3167,7 +3351,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
3167 */ 3351 */
3168static void setup_zone_migrate_reserve(struct zone *zone) 3352static void setup_zone_migrate_reserve(struct zone *zone)
3169{ 3353{
3170 unsigned long start_pfn, pfn, end_pfn; 3354 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3171 struct page *page; 3355 struct page *page;
3172 unsigned long block_migratetype; 3356 unsigned long block_migratetype;
3173 int reserve; 3357 int reserve;
@@ -3197,7 +3381,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3197 continue; 3381 continue;
3198 3382
3199 /* Blocks with reserved pages will never free, skip them. */ 3383 /* Blocks with reserved pages will never free, skip them. */
3200 if (PageReserved(page)) 3384 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3385 if (pageblock_is_reserved(pfn, block_end_pfn))
3201 continue; 3386 continue;
3202 3387
3203 block_migratetype = get_pageblock_migratetype(page); 3388 block_migratetype = get_pageblock_migratetype(page);
@@ -3386,7 +3571,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3386 pcp->batch = PAGE_SHIFT * 8; 3571 pcp->batch = PAGE_SHIFT * 8;
3387} 3572}
3388 3573
3389static __meminit void setup_zone_pageset(struct zone *zone) 3574static void setup_zone_pageset(struct zone *zone)
3390{ 3575{
3391 int cpu; 3576 int cpu;
3392 3577
@@ -3436,7 +3621,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3436 3621
3437 if (!slab_is_available()) { 3622 if (!slab_is_available()) {
3438 zone->wait_table = (wait_queue_head_t *) 3623 zone->wait_table = (wait_queue_head_t *)
3439 alloc_bootmem_node(pgdat, alloc_size); 3624 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3440 } else { 3625 } else {
3441 /* 3626 /*
3442 * This case means that a zone whose size was 0 gets new memory 3627 * This case means that a zone whose size was 0 gets new memory
@@ -3636,68 +3821,87 @@ void __init free_bootmem_with_active_regions(int nid,
3636 } 3821 }
3637} 3822}
3638 3823
3639int __init add_from_early_node_map(struct range *range, int az, 3824#ifdef CONFIG_HAVE_MEMBLOCK
3640 int nr_range, int nid) 3825/*
3826 * Basic iterator support. Return the last range of PFNs for a node
3827 * Note: nid == MAX_NUMNODES returns last region regardless of node
3828 */
3829static int __meminit last_active_region_index_in_nid(int nid)
3641{ 3830{
3642 int i; 3831 int i;
3643 u64 start, end;
3644 3832
3645 /* need to go over early_node_map to find out good range for node */ 3833 for (i = nr_nodemap_entries - 1; i >= 0; i--)
3646 for_each_active_range_index_in_nid(i, nid) { 3834 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3647 start = early_node_map[i].start_pfn; 3835 return i;
3648 end = early_node_map[i].end_pfn; 3836
3649 nr_range = add_range(range, az, nr_range, start, end); 3837 return -1;
3650 }
3651 return nr_range;
3652} 3838}
3653 3839
3654#ifdef CONFIG_NO_BOOTMEM 3840/*
3655void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, 3841 * Basic iterator support. Return the previous active range of PFNs for a node
3842 * Note: nid == MAX_NUMNODES returns next region regardless of node
3843 */
3844static int __meminit previous_active_region_index_in_nid(int index, int nid)
3845{
3846 for (index = index - 1; index >= 0; index--)
3847 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3848 return index;
3849
3850 return -1;
3851}
3852
3853#define for_each_active_range_index_in_nid_reverse(i, nid) \
3854 for (i = last_active_region_index_in_nid(nid); i != -1; \
3855 i = previous_active_region_index_in_nid(i, nid))
3856
3857u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3656 u64 goal, u64 limit) 3858 u64 goal, u64 limit)
3657{ 3859{
3658 int i; 3860 int i;
3659 void *ptr;
3660
3661 if (limit > get_max_mapped())
3662 limit = get_max_mapped();
3663 3861
3664 /* need to go over early_node_map to find out good range for node */ 3862 /* Need to go over early_node_map to find out good range for node */
3665 for_each_active_range_index_in_nid(i, nid) { 3863 for_each_active_range_index_in_nid_reverse(i, nid) {
3666 u64 addr; 3864 u64 addr;
3667 u64 ei_start, ei_last; 3865 u64 ei_start, ei_last;
3866 u64 final_start, final_end;
3668 3867
3669 ei_last = early_node_map[i].end_pfn; 3868 ei_last = early_node_map[i].end_pfn;
3670 ei_last <<= PAGE_SHIFT; 3869 ei_last <<= PAGE_SHIFT;
3671 ei_start = early_node_map[i].start_pfn; 3870 ei_start = early_node_map[i].start_pfn;
3672 ei_start <<= PAGE_SHIFT; 3871 ei_start <<= PAGE_SHIFT;
3673 addr = find_early_area(ei_start, ei_last,
3674 goal, limit, size, align);
3675 3872
3676 if (addr == -1ULL) 3873 final_start = max(ei_start, goal);
3874 final_end = min(ei_last, limit);
3875
3876 if (final_start >= final_end)
3677 continue; 3877 continue;
3678 3878
3679#if 0 3879 addr = memblock_find_in_range(final_start, final_end, size, align);
3680 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3681 nid,
3682 ei_start, ei_last, goal, limit, size,
3683 align, addr);
3684#endif
3685 3880
3686 ptr = phys_to_virt(addr); 3881 if (addr == MEMBLOCK_ERROR)
3687 memset(ptr, 0, size); 3882 continue;
3688 reserve_early_without_check(addr, addr + size, "BOOTMEM"); 3883
3689 /* 3884 return addr;
3690 * The min_count is set to 0 so that bootmem allocated blocks
3691 * are never reported as leaks.
3692 */
3693 kmemleak_alloc(ptr, size, 0, 0);
3694 return ptr;
3695 } 3885 }
3696 3886
3697 return NULL; 3887 return MEMBLOCK_ERROR;
3698} 3888}
3699#endif 3889#endif
3700 3890
3891int __init add_from_early_node_map(struct range *range, int az,
3892 int nr_range, int nid)
3893{
3894 int i;
3895 u64 start, end;
3896
3897 /* need to go over early_node_map to find out good range for node */
3898 for_each_active_range_index_in_nid(i, nid) {
3899 start = early_node_map[i].start_pfn;
3900 end = early_node_map[i].end_pfn;
3901 nr_range = add_range(range, az, nr_range, start, end);
3902 }
3903 return nr_range;
3904}
3701 3905
3702void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3906void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3703{ 3907{
@@ -3779,7 +3983,7 @@ static void __init find_usable_zone_for_movable(void)
3779 3983
3780/* 3984/*
3781 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3985 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3782 * because it is sized independant of architecture. Unlike the other zones, 3986 * because it is sized independent of architecture. Unlike the other zones,
3783 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3987 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3784 * in each node depending on the size of each node and how evenly kernelcore 3988 * in each node depending on the size of each node and how evenly kernelcore
3785 * is distributed. This helper function adjusts the zone ranges 3989 * is distributed. This helper function adjusts the zone ranges
@@ -3994,10 +4198,11 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3994 unsigned long usemapsize = usemap_size(zonesize); 4198 unsigned long usemapsize = usemap_size(zonesize);
3995 zone->pageblock_flags = NULL; 4199 zone->pageblock_flags = NULL;
3996 if (usemapsize) 4200 if (usemapsize)
3997 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 4201 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4202 usemapsize);
3998} 4203}
3999#else 4204#else
4000static void inline setup_usemap(struct pglist_data *pgdat, 4205static inline void setup_usemap(struct pglist_data *pgdat,
4001 struct zone *zone, unsigned long zonesize) {} 4206 struct zone *zone, unsigned long zonesize) {}
4002#endif /* CONFIG_SPARSEMEM */ 4207#endif /* CONFIG_SPARSEMEM */
4003 4208
@@ -4114,10 +4319,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4114 zone->zone_pgdat = pgdat; 4319 zone->zone_pgdat = pgdat;
4115 4320
4116 zone_pcp_init(zone); 4321 zone_pcp_init(zone);
4117 for_each_lru(l) { 4322 for_each_lru(l)
4118 INIT_LIST_HEAD(&zone->lru[l].list); 4323 INIT_LIST_HEAD(&zone->lru[l].list);
4119 zone->reclaim_stat.nr_saved_scan[l] = 0;
4120 }
4121 zone->reclaim_stat.recent_rotated[0] = 0; 4324 zone->reclaim_stat.recent_rotated[0] = 0;
4122 zone->reclaim_stat.recent_rotated[1] = 0; 4325 zone->reclaim_stat.recent_rotated[1] = 0;
4123 zone->reclaim_stat.recent_scanned[0] = 0; 4326 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4160,7 +4363,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4160 size = (end - start) * sizeof(struct page); 4363 size = (end - start) * sizeof(struct page);
4161 map = alloc_remap(pgdat->node_id, size); 4364 map = alloc_remap(pgdat->node_id, size);
4162 if (!map) 4365 if (!map)
4163 map = alloc_bootmem_node(pgdat, size); 4366 map = alloc_bootmem_node_nopanic(pgdat, size);
4164 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4367 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4165 } 4368 }
4166#ifndef CONFIG_NEED_MULTIPLE_NODES 4369#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4732,15 +4935,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4732 dma_reserve = new_dma_reserve; 4935 dma_reserve = new_dma_reserve;
4733} 4936}
4734 4937
4735#ifndef CONFIG_NEED_MULTIPLE_NODES
4736struct pglist_data __refdata contig_page_data = {
4737#ifndef CONFIG_NO_BOOTMEM
4738 .bdata = &bootmem_node_data[0]
4739#endif
4740 };
4741EXPORT_SYMBOL(contig_page_data);
4742#endif
4743
4744void __init free_area_init(unsigned long *zones_size) 4938void __init free_area_init(unsigned long *zones_size)
4745{ 4939{
4746 free_area_init_node(0, zones_size, 4940 free_area_init_node(0, zones_size,
@@ -4934,7 +5128,7 @@ void setup_per_zone_wmarks(void)
4934 * 1TB 101 10GB 5128 * 1TB 101 10GB
4935 * 10TB 320 32GB 5129 * 10TB 320 32GB
4936 */ 5130 */
4937void calculate_zone_inactive_ratio(struct zone *zone) 5131static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
4938{ 5132{
4939 unsigned int gb, ratio; 5133 unsigned int gb, ratio;
4940 5134
@@ -4948,7 +5142,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
4948 zone->inactive_ratio = ratio; 5142 zone->inactive_ratio = ratio;
4949} 5143}
4950 5144
4951static void __init setup_per_zone_inactive_ratio(void) 5145static void __meminit setup_per_zone_inactive_ratio(void)
4952{ 5146{
4953 struct zone *zone; 5147 struct zone *zone;
4954 5148
@@ -4980,7 +5174,7 @@ static void __init setup_per_zone_inactive_ratio(void)
4980 * 8192MB: 11584k 5174 * 8192MB: 11584k
4981 * 16384MB: 16384k 5175 * 16384MB: 16384k
4982 */ 5176 */
4983static int __init init_per_zone_wmark_min(void) 5177int __meminit init_per_zone_wmark_min(void)
4984{ 5178{
4985 unsigned long lowmem_kbytes; 5179 unsigned long lowmem_kbytes;
4986 5180
@@ -4992,6 +5186,7 @@ static int __init init_per_zone_wmark_min(void)
4992 if (min_free_kbytes > 65536) 5186 if (min_free_kbytes > 65536)
4993 min_free_kbytes = 65536; 5187 min_free_kbytes = 65536;
4994 setup_per_zone_wmarks(); 5188 setup_per_zone_wmarks();
5189 refresh_zone_stat_thresholds();
4995 setup_per_zone_lowmem_reserve(); 5190 setup_per_zone_lowmem_reserve();
4996 setup_per_zone_inactive_ratio(); 5191 setup_per_zone_inactive_ratio();
4997 return 0; 5192 return 0;
@@ -5281,26 +5476,71 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5281 * page allocater never alloc memory from ISOLATE block. 5476 * page allocater never alloc memory from ISOLATE block.
5282 */ 5477 */
5283 5478
5479static int
5480__count_immobile_pages(struct zone *zone, struct page *page, int count)
5481{
5482 unsigned long pfn, iter, found;
5483 /*
5484 * For avoiding noise data, lru_add_drain_all() should be called
5485 * If ZONE_MOVABLE, the zone never contains immobile pages
5486 */
5487 if (zone_idx(zone) == ZONE_MOVABLE)
5488 return true;
5489
5490 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5491 return true;
5492
5493 pfn = page_to_pfn(page);
5494 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5495 unsigned long check = pfn + iter;
5496
5497 if (!pfn_valid_within(check))
5498 continue;
5499
5500 page = pfn_to_page(check);
5501 if (!page_count(page)) {
5502 if (PageBuddy(page))
5503 iter += (1 << page_order(page)) - 1;
5504 continue;
5505 }
5506 if (!PageLRU(page))
5507 found++;
5508 /*
5509 * If there are RECLAIMABLE pages, we need to check it.
5510 * But now, memory offline itself doesn't call shrink_slab()
5511 * and it still to be fixed.
5512 */
5513 /*
5514 * If the page is not RAM, page_count()should be 0.
5515 * we don't need more check. This is an _used_ not-movable page.
5516 *
5517 * The problematic thing here is PG_reserved pages. PG_reserved
5518 * is set to both of a memory hole page and a _used_ kernel
5519 * page at boot.
5520 */
5521 if (found > count)
5522 return false;
5523 }
5524 return true;
5525}
5526
5527bool is_pageblock_removable_nolock(struct page *page)
5528{
5529 struct zone *zone = page_zone(page);
5530 return __count_immobile_pages(zone, page, 0);
5531}
5532
5284int set_migratetype_isolate(struct page *page) 5533int set_migratetype_isolate(struct page *page)
5285{ 5534{
5286 struct zone *zone; 5535 struct zone *zone;
5287 struct page *curr_page; 5536 unsigned long flags, pfn;
5288 unsigned long flags, pfn, iter;
5289 unsigned long immobile = 0;
5290 struct memory_isolate_notify arg; 5537 struct memory_isolate_notify arg;
5291 int notifier_ret; 5538 int notifier_ret;
5292 int ret = -EBUSY; 5539 int ret = -EBUSY;
5293 int zone_idx;
5294 5540
5295 zone = page_zone(page); 5541 zone = page_zone(page);
5296 zone_idx = zone_idx(zone);
5297 5542
5298 spin_lock_irqsave(&zone->lock, flags); 5543 spin_lock_irqsave(&zone->lock, flags);
5299 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5300 zone_idx == ZONE_MOVABLE) {
5301 ret = 0;
5302 goto out;
5303 }
5304 5544
5305 pfn = page_to_pfn(page); 5545 pfn = page_to_pfn(page);
5306 arg.start_pfn = pfn; 5546 arg.start_pfn = pfn;
@@ -5320,23 +5560,20 @@ int set_migratetype_isolate(struct page *page)
5320 */ 5560 */
5321 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5561 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5322 notifier_ret = notifier_to_errno(notifier_ret); 5562 notifier_ret = notifier_to_errno(notifier_ret);
5323 if (notifier_ret || !arg.pages_found) 5563 if (notifier_ret)
5324 goto out; 5564 goto out;
5325 5565 /*
5326 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { 5566 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5327 if (!pfn_valid_within(pfn)) 5567 * We just check MOVABLE pages.
5328 continue; 5568 */
5329 5569 if (__count_immobile_pages(zone, page, arg.pages_found))
5330 curr_page = pfn_to_page(iter);
5331 if (!page_count(curr_page) || PageLRU(curr_page))
5332 continue;
5333
5334 immobile++;
5335 }
5336
5337 if (arg.pages_found == immobile)
5338 ret = 0; 5570 ret = 0;
5339 5571
5572 /*
5573 * immobile means "not-on-lru" paes. If immobile is larger than
5574 * removable-by-driver pages reported by notifier, we'll fail.
5575 */
5576
5340out: 5577out:
5341 if (!ret) { 5578 if (!ret) {
5342 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5579 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
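The two hunks above add __count_immobile_pages(), which walks a pageblock, strides over free buddy blocks by 2^order pages, and counts used pages that are not on the LRU; set_migratetype_isolate() then fails if that count exceeds the pages the isolation notifier reported as removable. A toy sketch of the same walk over a mocked page array; the struct, the block size, and the states in main() are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_PAGES 16          /* illustrative, not pageblock_nr_pages */

struct page_sketch {
	int count;        /* 0 means the page is free */
	bool buddy;       /* head of a free buddy block */
	int order;        /* order of that buddy block */
	bool on_lru;      /* page is on an LRU list, i.e. movable */
};

/* Return true if at most 'count' used pages in the block are off-LRU. */
static bool block_is_movable(struct page_sketch *blk, int count)
{
	int found = 0;

	for (int i = 0; i < PAGEBLOCK_PAGES; i++) {
		struct page_sketch *p = &blk[i];

		if (p->count == 0) {
			if (p->buddy)
				i += (1 << p->order) - 1;  /* skip the whole free block */
			continue;
		}
		if (!p->on_lru)
			found++;                           /* used and not movable */
		if (found > count)
			return false;
	}
	return true;
}

int main(void)
{
	struct page_sketch blk[PAGEBLOCK_PAGES] = {
		[0] = { .count = 0, .buddy = true, .order = 3 },  /* 8 free pages */
		[8] = { .count = 2, .on_lru = true },             /* movable */
		[9] = { .count = 1, .on_lru = false },            /* pinned */
	};

	printf("movable with 0 slack: %d\n", block_is_movable(blk, 0));  /* 0 */
	printf("movable with 1 slack: %d\n", block_is_movable(blk, 1));  /* 1 */
	return 0;
}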
@@ -5455,7 +5692,6 @@ static struct trace_print_flags pageflag_names[] = {
5455 {1UL << PG_swapcache, "swapcache" }, 5692 {1UL << PG_swapcache, "swapcache" },
5456 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5693 {1UL << PG_mappedtodisk, "mappedtodisk" },
5457 {1UL << PG_reclaim, "reclaim" }, 5694 {1UL << PG_reclaim, "reclaim" },
5458 {1UL << PG_buddy, "buddy" },
5459 {1UL << PG_swapbacked, "swapbacked" }, 5695 {1UL << PG_swapbacked, "swapbacked" },
5460 {1UL << PG_unevictable, "unevictable" }, 5696 {1UL << PG_unevictable, "unevictable" },
5461#ifdef CONFIG_MMU 5697#ifdef CONFIG_MMU
@@ -5503,7 +5739,8 @@ void dump_page(struct page *page)
5503{ 5739{
5504 printk(KERN_ALERT 5740 printk(KERN_ALERT
5505 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5741 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5506 page, page_count(page), page_mapcount(page), 5742 page, atomic_read(&page->_count), page_mapcount(page),
5507 page->mapping, page->index); 5743 page->mapping, page->index);
5508 dump_page_flags(page->flags); 5744 dump_page_flags(page->flags);
5745 mem_cgroup_print_bad_page(page);
5509} 5746}