Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  267
1 file changed, 219 insertions(+), 48 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a6326c71b663..08b349931ebc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,6 +49,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/memory.h>
+#include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
 
@@ -475,6 +476,8 @@ static inline void __free_one_page(struct page *page,
 		int migratetype)
 {
 	unsigned long page_idx;
+	unsigned long combined_idx;
+	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
 		if (unlikely(destroy_compound_page(page, order)))
@@ -488,9 +491,6 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		unsigned long combined_idx;
-		struct page *buddy;
-
 		buddy = __page_find_buddy(page, page_idx, order);
 		if (!page_is_buddy(page, buddy, order))
 			break;
@@ -505,8 +505,29 @@ static inline void __free_one_page(struct page *page,
 		order++;
 	}
 	set_page_order(page, order);
-	list_add(&page->lru,
-		&zone->free_area[order].free_list[migratetype]);
+
+	/*
+	 * If this is not the largest possible page, check if the buddy
+	 * of the next-highest order is free. If it is, it's possible
+	 * that pages are being freed that will coalesce soon. In case,
+	 * that is happening, add the free page to the tail of the list
+	 * so it's less likely to be used soon and more likely to be merged
+	 * as a higher order page
+	 */
+	if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) {
+		struct page *higher_page, *higher_buddy;
+		combined_idx = __find_combined_index(page_idx, order);
+		higher_page = page + combined_idx - page_idx;
+		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
+			list_add_tail(&page->lru,
+				&zone->free_area[order].free_list[migratetype]);
+			goto out;
+		}
+	}
+
+	list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
+out:
 	zone->free_area[order].nr_free++;
 }
 
@@ -599,20 +620,23 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 	spin_unlock(&zone->lock);
 }
 
-static void __free_pages_ok(struct page *page, unsigned int order)
+static bool free_pages_prepare(struct page *page, unsigned int order)
 {
-	unsigned long flags;
 	int i;
 	int bad = 0;
-	int wasMlocked = __TestClearPageMlocked(page);
 
 	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
-	for (i = 0 ; i < (1 << order) ; ++i)
-		bad += free_pages_check(page + i);
+	for (i = 0; i < (1 << order); i++) {
+		struct page *pg = page + i;
+
+		if (PageAnon(pg))
+			pg->mapping = NULL;
+		bad += free_pages_check(pg);
+	}
 	if (bad)
-		return;
+		return false;
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -622,6 +646,17 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	arch_free_page(page, order);
 	kernel_map_pages(page, 1 << order, 0);
 
+	return true;
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+	unsigned long flags;
+	int wasMlocked = __TestClearPageMlocked(page);
+
+	if (!free_pages_prepare(page, order))
+		return;
+
 	local_irq_save(flags);
 	if (unlikely(wasMlocked))
 		free_page_mlock(page);
@@ -1107,21 +1142,9 @@ void free_hot_cold_page(struct page *page, int cold)
 	int migratetype;
 	int wasMlocked = __TestClearPageMlocked(page);
 
-	trace_mm_page_free_direct(page, 0);
-	kmemcheck_free_shadow(page, 0);
-
-	if (PageAnon(page))
-		page->mapping = NULL;
-	if (free_pages_check(page))
+	if (!free_pages_prepare(page, 0))
 		return;
 
-	if (!PageHighMem(page)) {
-		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
-		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
-	}
-	arch_free_page(page, 0);
-	kernel_map_pages(page, 1, 0);
-
 	migratetype = get_pageblock_migratetype(page);
 	set_page_private(page, migratetype);
 	local_irq_save(flags);
@@ -1188,6 +1211,51 @@ void split_page(struct page *page, unsigned int order)
 }
 
 /*
+ * Similar to split_page except the page is already free. As this is only
+ * being used for migration, the migratetype of the block also changes.
+ * As this is called with interrupts disabled, the caller is responsible
+ * for calling arch_alloc_page() and kernel_map_page() after interrupts
+ * are enabled.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+int split_free_page(struct page *page)
+{
+	unsigned int order;
+	unsigned long watermark;
+	struct zone *zone;
+
+	BUG_ON(!PageBuddy(page));
+
+	zone = page_zone(page);
+	order = page_order(page);
+
+	/* Obey watermarks as if the page was being allocated */
+	watermark = low_wmark_pages(zone) + (1 << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return 0;
+
+	/* Remove page from free list */
+	list_del(&page->lru);
+	zone->free_area[order].nr_free--;
+	rmv_page_order(page);
+	__mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
+
+	/* Split into individual pages */
+	set_page_refcounted(page);
+	split_page(page, order);
+
+	if (order >= pageblock_order - 1) {
+		struct page *endpage = page + (1 << order) - 1;
+		for (; page < endpage; page += pageblock_nr_pages)
+			set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+	}
+
+	return 1 << order;
+}
+
+/*
  * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
  * we cheat by calling it from here, in the order > 0 path. Saves a branch
  * or two.
@@ -1693,6 +1761,62 @@ out:
 	return page;
 }
 
+#ifdef CONFIG_COMPACTION
+/* Try memory compaction for high-order allocations before reclaim */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	struct page *page;
+
+	if (!order || compaction_deferred(preferred_zone))
+		return NULL;
+
+	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+								nodemask);
+	if (*did_some_progress != COMPACT_SKIPPED) {
+
+		/* Page migration frees to the PCP lists but we want merging */
+		drain_pages(get_cpu());
+		put_cpu();
+
+		page = get_page_from_freelist(gfp_mask, nodemask,
+				order, zonelist, high_zoneidx,
+				alloc_flags, preferred_zone,
+				migratetype);
+		if (page) {
+			preferred_zone->compact_considered = 0;
+			preferred_zone->compact_defer_shift = 0;
+			count_vm_event(COMPACTSUCCESS);
+			return page;
+		}
+
+		/*
+		 * It's bad if compaction run occurs and fails.
+		 * The most likely reason is that pages exist,
+		 * but not enough to satisfy watermarks.
+		 */
+		count_vm_event(COMPACTFAIL);
+		defer_compaction(preferred_zone);
+
+		cond_resched();
+	}
+
+	return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+	int migratetype, unsigned long *did_some_progress)
+{
+	return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1879,6 +2003,15 @@ rebalance:
 	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
 		goto nopage;
 
+	/* Try direct compaction */
+	page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress);
+	if (page)
+		goto got_pg;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -1970,10 +2103,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
+	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
 	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
-	if (!preferred_zone)
+	if (!preferred_zone) {
+		put_mems_allowed();
 		return NULL;
+	}
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -1983,6 +2119,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	put_mems_allowed();
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 	return page;
@@ -2434,8 +2571,11 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 			strncpy((char*)table->data, saved_string,
 				NUMA_ZONELIST_ORDER_LEN);
 			user_zonelist_order = oldval;
-		} else if (oldval != user_zonelist_order)
-			build_all_zonelists();
+		} else if (oldval != user_zonelist_order) {
+			mutex_lock(&zonelists_mutex);
+			build_all_zonelists(NULL);
+			mutex_unlock(&zonelists_mutex);
+		}
 	}
 out:
 	mutex_unlock(&zl_order_mutex);
@@ -2582,7 +2722,7 @@ static int default_zonelist_order(void)
 	 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
 	 * If they are really small and used heavily, the system can fall
 	 * into OOM very easily.
-	 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+	 * This function detect ZONE_DMA/DMA32 size and configures zone order.
 	 */
 	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
 	low_kmem_size = 0;
@@ -2594,6 +2734,15 @@ static int default_zonelist_order(void)
 				if (zone_type < ZONE_NORMAL)
 					low_kmem_size += z->present_pages;
 				total_size += z->present_pages;
+			} else if (zone_type == ZONE_NORMAL) {
+				/*
+				 * If any node has only lowmem, then node order
+				 * is preferred to allow kernel allocations
+				 * locally; otherwise, they can easily infringe
+				 * on other nodes when there is an abundance of
+				 * lowmem available to allocate from.
+				 */
+				return ZONELIST_ORDER_NODE;
 			}
 		}
 	}
@@ -2776,9 +2925,16 @@ static void build_zonelist_cache(pg_data_t *pgdat)
  */
 static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
+static void setup_zone_pageset(struct zone *zone);
+
+/*
+ * Global mutex to protect against size modification of zonelists
+ * as well as to serialize pageset setup for the new populated zone.
+ */
+DEFINE_MUTEX(zonelists_mutex);
 
 /* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *dummy)
+static __init_refok int __build_all_zonelists(void *data)
 {
 	int nid;
 	int cpu;
@@ -2793,6 +2949,14 @@ static int __build_all_zonelists(void *dummy)
 		build_zonelist_cache(pgdat);
 	}
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* Setup real pagesets for the new zone */
+	if (data) {
+		struct zone *zone = data;
+		setup_zone_pageset(zone);
+	}
+#endif
+
 	/*
 	 * Initialize the boot_pagesets that are going to be used
 	 * for bootstrapping processors. The real pagesets for
@@ -2812,7 +2976,11 @@ static int __build_all_zonelists(void *dummy)
 	return 0;
 }
 
-void build_all_zonelists(void)
+/*
+ * Called with zonelists_mutex held always
+ * unless system_state == SYSTEM_BOOTING.
+ */
+void build_all_zonelists(void *data)
 {
 	set_zonelist_order();
 
@@ -2823,7 +2991,7 @@ void build_all_zonelists(void)
 	} else {
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
-		stop_machine(__build_all_zonelists, NULL, NULL);
+		stop_machine(__build_all_zonelists, data, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
@@ -3146,31 +3314,34 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
 	pcp->batch = PAGE_SHIFT * 8;
 }
 
+static __meminit void setup_zone_pageset(struct zone *zone)
+{
+	int cpu;
+
+	zone->pageset = alloc_percpu(struct per_cpu_pageset);
+
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+
+		setup_pageset(pcp, zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(pcp,
+				(zone->present_pages /
+					percpu_pagelist_fraction));
+	}
+}
+
 /*
  * Allocate per cpu pagesets and initialize them.
  * Before this call only boot pagesets were available.
- * Boot pagesets will no longer be used by this processorr
- * after setup_per_cpu_pageset().
  */
 void __init setup_per_cpu_pageset(void)
 {
 	struct zone *zone;
-	int cpu;
 
-	for_each_populated_zone(zone) {
-		zone->pageset = alloc_percpu(struct per_cpu_pageset);
-
-		for_each_possible_cpu(cpu) {
-			struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
-
-			setup_pageset(pcp, zone_batchsize(zone));
-
-			if (percpu_pagelist_fraction)
-				setup_pagelist_highmark(pcp,
-					(zone->present_pages /
-						percpu_pagelist_fraction));
-		}
-	}
+	for_each_populated_zone(zone)
+		setup_zone_pageset(zone);
 }
 
 static noinline __init_refok