Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 259 ++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 156 insertions(+), 103 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ff7e1587239..bd7625676a6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
 	}
 }
 
+/* update __split_huge_page_refcount if you change this function */
 static int destroy_compound_page(struct page *page, unsigned long order)
 {
 	int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
  *
  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-	unsigned long buddy_idx = page_idx ^ (1 << order);
-
-	return page + (buddy_idx - page_idx);
-}
-
 static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
 {
-	return (page_idx & ~(1 << order));
+	return page_idx ^ (1 << order);
 }
 
 /*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
  *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
  *
  * For recording page's order, we use page_private(page).
  */
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * as necessary, plus some accounting needed to play nicely with other
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
  * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
 {
 	unsigned long page_idx;
 	unsigned long combined_idx;
+	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		buddy = __page_find_buddy(page, page_idx, order);
+		buddy_idx = __find_buddy_index(page_idx, order);
+		buddy = page + (buddy_idx - page_idx);
 		if (!page_is_buddy(page, buddy, order))
 			break;
 
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
 		list_del(&buddy->lru);
 		zone->free_area[order].nr_free--;
 		rmv_page_order(buddy);
-		combined_idx = __find_combined_index(page_idx, order);
+		combined_idx = buddy_idx & page_idx;
 		page = page + (combined_idx - page_idx);
 		page_idx = combined_idx;
 		order++;
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
 	 */
 	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
 		struct page *higher_page, *higher_buddy;
-		combined_idx = __find_combined_index(page_idx, order);
-		higher_page = page + combined_idx - page_idx;
-		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		combined_idx = buddy_idx & page_idx;
+		higher_page = page + (combined_idx - page_idx);
+		buddy_idx = __find_buddy_index(combined_idx, order + 1);
+		higher_buddy = page + (buddy_idx - combined_idx);
 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
 			list_add_tail(&page->lru,
 				&zone->free_area[order].free_list[migratetype]);
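
Aside, for readers new to the buddy arithmetic in the hunks above: two buddies of a given order differ only in the bit selected by that order, so XOR yields the buddy's index and AND yields the index at which the merged, next-higher-order block starts. A minimal user-space sketch of the same arithmetic (illustration only, not part of the patch; the names here are invented):

#include <stdio.h>

/* Same idea as __find_buddy_index above: flip the bit that selects which
 * half of the next-higher-order block this block occupies. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* start of an order-2 block: pages 12..15 */
	unsigned int order = 2;
	unsigned long buddy_idx = find_buddy_index(page_idx, order);	/* 8 */

	/* Both indexes are multiples of 1 << order and differ only in that
	 * bit, so AND keeps the smaller one: the merged block's start. */
	unsigned long combined_idx = buddy_idx & page_idx;		/* 8 */

	printf("buddy=%lu combined=%lu\n", buddy_idx, combined_idx);
	return 0;
}
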
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
-	for (i = 0; i < (1 << order); i++) {
-		struct page *pg = page + i;
-
-		if (PageAnon(pg))
-			pg->mapping = NULL;
-		bad += free_pages_check(pg);
-	}
+	if (PageAnon(page))
+		page->mapping = NULL;
+	for (i = 0; i < (1 << order); i++)
+		bad += free_pages_check(page + i);
 	if (bad)
 		return false;
 
@@ -1095,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
+		if (pcp->count) {
+			free_pcppages_bulk(zone, pcp->count, pcp);
+			pcp->count = 0;
+		}
 		local_irq_restore(flags);
 	}
 }
@@ -1460,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
  */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) + 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
 }
 
 #ifdef CONFIG_NUMA
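
Aside on the watermark rework above: __zone_watermark_ok now receives the free-page count as a parameter so that zone_watermark_ok_safe can pass a drift-corrected snapshot when the per-cpu counters may be stale. The core test is small enough to model on its own; a stand-alone sketch under assumed inputs (not the kernel function, and the ALLOC_HIGH/ALLOC_HARDER adjustments are left out):

#include <stdbool.h>

/* Model of the per-order watermark test: discount free blocks too small to
 * satisfy the request, and require progressively less headroom as the order
 * grows, as in the loop above. */
static bool watermark_ok_model(long free_pages, long mark, int order,
			       const long *nr_free, long lowmem_reserve)
{
	long min = mark;
	int o;

	free_pages -= (1 << order) + 1;		/* mirrors the patched code */
	if (free_pages <= min + lowmem_reserve)
		return false;
	for (o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;	/* too small for this request */
		min >>= 1;			/* relax the mark per order */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	const long nr_free[3] = { 50, 10, 4 };	/* free blocks at orders 0..2 */

	/* e.g. 128 free pages, mark 32, an order-2 request, no lowmem reserve */
	return watermark_ok_model(128, 32, 2, nr_free, 0) ? 0 : 1;
}
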
@@ -1793,15 +1807,18 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress)
+	int migratetype, unsigned long *did_some_progress,
+	bool sync_migration)
 {
 	struct page *page;
 
 	if (!order || compaction_deferred(preferred_zone))
 		return NULL;
 
+	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-						nodemask);
+						nodemask, sync_migration);
+	current->flags &= ~PF_MEMALLOC;
 	if (*did_some_progress != COMPACT_SKIPPED) {
 
 		/* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1854,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress)
+	int migratetype, unsigned long *did_some_progress,
+	bool sync_migration)
 {
 	return NULL;
 }
@@ -1852,23 +1870,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 {
 	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
-	struct task_struct *p = current;
 	bool drained = false;
 
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	p->flags |= PF_MEMALLOC;
+	current->flags |= PF_MEMALLOC;
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
+	current->reclaim_state = &reclaim_state;
 
 	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
 
-	p->reclaim_state = NULL;
+	current->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
-	p->flags &= ~PF_MEMALLOC;
+	current->flags &= ~PF_MEMALLOC;
 
 	cond_resched();
 
@@ -1920,19 +1937,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 
 static inline
 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-						enum zone_type high_zoneidx)
+						enum zone_type high_zoneidx,
+						enum zone_type classzone_idx)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order);
+		wakeup_kswapd(zone, order, classzone_idx);
 }
 
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-	struct task_struct *p = current;
 	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 
@@ -1948,18 +1965,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
 	if (!wait) {
-		alloc_flags |= ALLOC_HARDER;
+		/*
+		 * Not worth trying to allocate harder for
+		 * __GFP_NOMEMALLOC even if it can't schedule.
+		 */
+		if (!(gfp_mask & __GFP_NOMEMALLOC))
+			alloc_flags |= ALLOC_HARDER;
 		/*
 		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
-	} else if (unlikely(rt_task(p)) && !in_interrupt())
+	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
 		if (!in_interrupt() &&
-		    ((p->flags & PF_MEMALLOC) ||
+		    ((current->flags & PF_MEMALLOC) ||
 		     unlikely(test_thread_flag(TIF_MEMDIE))))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
@@ -1978,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
-	struct task_struct *p = current;
+	bool sync_migration = false;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	wake_all_kswapd(order, zonelist, high_zoneidx);
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapd(order, zonelist, high_zoneidx,
+						zone_idx(preferred_zone));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2012,6 +2036,14 @@ restart:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+	/*
+	 * Find the true preferred zone if the allocation is unconstrained by
+	 * cpusets.
+	 */
+	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+					&preferred_zone);
+
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2034,21 +2066,26 @@ rebalance:
 		goto nopage;
 
 	/* Avoid recursion of direct reclaim */
-	if (p->flags & PF_MEMALLOC)
+	if (current->flags & PF_MEMALLOC)
 		goto nopage;
 
 	/* Avoid allocations with no watermarks from looping endlessly */
 	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
 		goto nopage;
 
-	/* Try direct compaction */
+	/*
+	 * Try direct compaction. The first pass is asynchronous. Subsequent
+	 * attempts after direct reclaim are synchronous
+	 */
 	page = __alloc_pages_direct_compact(gfp_mask, order,
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress);
+					migratetype, &did_some_progress,
+					sync_migration);
 	if (page)
 		goto got_pg;
+	sync_migration = true;
 
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,13 +2139,27 @@ rebalance:
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
+	} else {
+		/*
+		 * High-order allocations do not necessarily loop after
+		 * direct reclaim and reclaim/compaction depends on compaction
+		 * being called after reclaim so call directly if necessary
+		 */
+		page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress,
+					sync_migration);
+		if (page)
+			goto got_pg;
 	}
 
 nopage:
 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 		printk(KERN_WARNING "%s: page allocation failure."
 			" order:%d, mode:0x%x\n",
-			p->comm, order, gfp_mask);
+			current->comm, order, gfp_mask);
 		dump_stack();
 		show_mem();
 	}
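
Aside on the compaction hunks taken together: the slow path now tries compaction asynchronously before direct reclaim, switches to synchronous migration for any later compaction attempt, and gives callers that will not loop one final synchronous try. A compressed control-flow sketch (all functions here are stand-ins, not kernel APIs):

#include <stdbool.h>
#include <stddef.h>

struct page;

/* Stand-ins for the real allocation steps. */
static struct page *try_compact(bool sync_migration) { (void)sync_migration; return NULL; }
static struct page *try_direct_reclaim(void) { return NULL; }
static bool should_retry(void) { return false; }

static struct page *slowpath_model(void)
{
	bool sync_migration = false;	/* first compaction pass is asynchronous */
	struct page *page;

	for (;;) {
		page = try_compact(sync_migration);
		if (page)
			break;
		sync_migration = true;	/* later passes migrate synchronously */

		page = try_direct_reclaim();
		if (page)
			break;

		if (!should_retry()) {
			/* callers that give up still get one synchronous try */
			page = try_compact(sync_migration);
			break;
		}
	}
	return page;
}

int main(void)
{
	return slowpath_model() ? 0 : 1;
}
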
@@ -2151,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
-	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+	first_zones_zonelist(zonelist, high_zoneidx,
+				nodemask ? : &cpuset_current_mems_allowed,
+				&preferred_zone);
 	if (!preferred_zone) {
 		put_mems_allowed();
 		return NULL;
@@ -2442,7 +2495,7 @@ void show_free_areas(void)
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_nr_free_pages(zone)),
+			K(zone_page_state(zone, NR_FREE_PAGES)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
@@ -2585,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-	if (s)
-		return __parse_numa_zonelist_order(s);
-	return 0;
+	int ret;
+
+	if (!s)
+		return 0;
+
+	ret = __parse_numa_zonelist_order(s);
+	if (ret == 0)
+		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+	return ret;
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
 
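
Usage note on the hunk above: once a string passed via the early parameter parses successfully, it is now also copied into numa_zonelist_order, so the chosen ordering is retained rather than silently dropped. On the boot command line this looks like, for example, numa_zonelist_order=zone (as far as I recall the parser accepts variants of "default", "node" and "zone"; see Documentation/kernel-parameters.txt for the authoritative list).
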
@@ -3639,13 +3699,45 @@ void __init free_bootmem_with_active_regions(int nid,
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK
+/*
+ * Basic iterator support. Return the last range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns last region regardless of node
+ */
+static int __meminit last_active_region_index_in_nid(int nid)
+{
+	int i;
+
+	for (i = nr_nodemap_entries - 1; i >= 0; i--)
+		if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
+			return i;
+
+	return -1;
+}
+
+/*
+ * Basic iterator support. Return the previous active range of PFNs for a node
+ * Note: nid == MAX_NUMNODES returns next region regardless of node
+ */
+static int __meminit previous_active_region_index_in_nid(int index, int nid)
+{
+	for (index = index - 1; index >= 0; index--)
+		if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
+			return index;
+
+	return -1;
+}
+
+#define for_each_active_range_index_in_nid_reverse(i, nid) \
+	for (i = last_active_region_index_in_nid(nid); i != -1; \
+		i = previous_active_region_index_in_nid(i, nid))
+
 u64 __init find_memory_core_early(int nid, u64 size, u64 align,
 					u64 goal, u64 limit)
 {
 	int i;
 
 	/* Need to go over early_node_map to find out good range for node */
-	for_each_active_range_index_in_nid(i, nid) {
+	for_each_active_range_index_in_nid_reverse(i, nid) {
 		u64 addr;
 		u64 ei_start, ei_last;
 		u64 final_start, final_end;
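
Aside on the iterators added above: this is ordinary walk-the-array-backwards machinery, wrapped in a macro so that find_memory_core_early can scan ranges from the tail. The same shape, detached from early_node_map (a self-contained sketch with made-up types, not kernel code):

#include <stdio.h>

struct mem_range { int nid; unsigned long start_pfn, end_pfn; };

static struct mem_range node_map[] = {
	{ 0,   0, 100 },
	{ 1, 100, 200 },
	{ 0, 200, 300 },
};
static const int nr_entries = sizeof(node_map) / sizeof(node_map[0]);

/* Index of the last entry belonging to nid (nid < 0 matches any), or -1. */
static int last_index_in_nid(int nid)
{
	int i;

	for (i = nr_entries - 1; i >= 0; i--)
		if (nid < 0 || node_map[i].nid == nid)
			return i;
	return -1;
}

/* Index of the previous matching entry before index, or -1. */
static int previous_index_in_nid(int index, int nid)
{
	for (index = index - 1; index >= 0; index--)
		if (nid < 0 || node_map[index].nid == nid)
			return index;
	return -1;
}

#define for_each_index_in_nid_reverse(i, nid) \
	for (i = last_index_in_nid(nid); i != -1; \
	     i = previous_index_in_nid(i, nid))

int main(void)
{
	int i;

	for_each_index_in_nid_reverse(i, 0)	/* visits index 2, then 0 */
		printf("range %d: %lu..%lu\n", i,
		       node_map[i].start_pfn, node_map[i].end_pfn);
	return 0;
}
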
@@ -3688,34 +3780,6 @@ int __init add_from_early_node_map(struct range *range, int az,
 	return nr_range;
 }
 
-#ifdef CONFIG_NO_BOOTMEM
-void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
-					u64 goal, u64 limit)
-{
-	void *ptr;
-	u64 addr;
-
-	if (limit > memblock.current_limit)
-		limit = memblock.current_limit;
-
-	addr = find_memory_core_early(nid, size, align, goal, limit);
-
-	if (addr == MEMBLOCK_ERROR)
-		return NULL;
-
-	ptr = phys_to_virt(addr);
-	memset(ptr, 0, size);
-	memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
-	/*
-	 * The min_count is set to 0 so that bootmem allocated blocks
-	 * are never reported as leaks.
-	 */
-	kmemleak_alloc(ptr, size, 0, 0);
-	return ptr;
-}
-#endif
-
-
 void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
 {
 	int i;
@@ -4014,7 +4078,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
 	zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
 }
 #else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
 				struct zone *zone, unsigned long zonesize) {}
 #endif /* CONFIG_SPARSEMEM */
 
@@ -4749,15 +4813,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 	dma_reserve = new_dma_reserve;
 }
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
-#ifndef CONFIG_NO_BOOTMEM
-	.bdata = &bootmem_node_data[0]
-#endif
-};
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
 void __init free_area_init(unsigned long *zones_size)
 {
 	free_area_init_node(0, zones_size,
@@ -5316,10 +5371,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
 		unsigned long check = pfn + iter;
 
-		if (!pfn_valid_within(check)) {
-			iter++;
+		if (!pfn_valid_within(check))
 			continue;
-		}
+
 		page = pfn_to_page(check);
 		if (!page_count(page)) {
 			if (PageBuddy(page))
@@ -5517,7 +5571,6 @@ static struct trace_print_flags pageflag_names[] = {
 	{1UL << PG_swapcache,		"swapcache"	},
 	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
 	{1UL << PG_reclaim,		"reclaim"	},
-	{1UL << PG_buddy,		"buddy"		},
 	{1UL << PG_swapbacked,		"swapbacked"	},
 	{1UL << PG_unevictable,		"unevictable"	},
 #ifdef CONFIG_MMU
@@ -5565,7 +5618,7 @@ void dump_page(struct page *page)
 {
 	printk(KERN_ALERT
 	       "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-		page, page_count(page), page_mapcount(page),
+		page, atomic_read(&page->_count), page_mapcount(page),
 		page->mapping, page->index);
 	dump_page_flags(page->flags);
 }