Diffstat (limited to 'mm/page_alloc.c')
 -rw-r--r--  mm/page_alloc.c  253
 1 file changed, 190 insertions(+), 63 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7990ca154d1b..794e6715c226 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/page-debug-flags.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
 
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
+/*
+ * When calculating the number of globally allowed dirty pages, there
+ * is a certain number of per-zone reserves that should not be
+ * considered dirtyable memory. This is the sum of those reserves
+ * over all existing zones that contribute dirtyable memory.
+ */
+unsigned long dirty_balance_reserve __read_mostly;
+
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
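
The hunk above only introduces the counter; the consumer that subtracts it when sizing dirtyable memory lives in mm/page-writeback.c and is not part of this diff. A minimal user-space sketch of the intended arithmetic, using made-up figures:

#include <stdio.h>

/* Hypothetical figures, in pages, standing in for the kernel globals. */
static unsigned long totalram_pages        = 2097152; /* 8 GB of 4 KB pages       */
static unsigned long dirty_balance_reserve = 65536;   /* sum of per-zone reserves */
static unsigned int  dirty_ratio           = 20;      /* vm.dirty_ratio, percent  */

int main(void)
{
    /* Reserves are excluded before the percentage is applied. */
    unsigned long dirtyable   = totalram_pages - dirty_balance_reserve;
    unsigned long dirty_limit = dirtyable * dirty_ratio / 100;

    printf("dirtyable: %lu pages, global dirty limit: %lu pages\n",
           dirtyable, dirty_limit);
    return 0;
}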
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
     saved_gfp_mask = gfp_allowed_mask;
     gfp_allowed_mask &= ~GFP_IOFS;
 }
+
+bool pm_suspended_storage(void)
+{
+    if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+        return false;
+    return true;
+}
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
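
pm_suspended_storage() only inspects gfp_allowed_mask, which pm_restrict_gfp_mask() (context above) strips GFP_IOFS from during suspend. A user-space rehearsal of that bit test, with illustrative bit values:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's gfp bits; the real values differ per tree. */
#define __GFP_IO 0x40u
#define __GFP_FS 0x80u
#define GFP_IOFS (__GFP_IO | __GFP_FS)

static unsigned int gfp_allowed_mask = 0xffu;  /* IO and FS allowed, as in normal operation */

static bool pm_suspended_storage(void)
{
    /* Storage is usable only while both the IO and FS bits are still allowed. */
    return (gfp_allowed_mask & GFP_IOFS) != GFP_IOFS;
}

int main(void)
{
    printf("before suspend: %d\n", pm_suspended_storage());  /* 0 */
    gfp_allowed_mask &= ~GFP_IOFS;  /* what pm_restrict_gfp_mask() does */
    printf("during suspend: %d\n", pm_suspended_storage());  /* 1 */
    return 0;
}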
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
         clear_highpage(page + i);
 }
 
+#ifdef CONFIG_DEBUG_PAGEALLOC
+unsigned int _debug_guardpage_minorder;
+
+static int __init debug_guardpage_minorder_setup(char *buf)
+{
+    unsigned long res;
+
+    if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
+        printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
+        return 0;
+    }
+    _debug_guardpage_minorder = res;
+    printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
+    return 0;
+}
+__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
+
+static inline void set_page_guard_flag(struct page *page)
+{
+    __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+
+static inline void clear_page_guard_flag(struct page *page)
+{
+    __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
+}
+#else
+static inline void set_page_guard_flag(struct page *page) { }
+static inline void clear_page_guard_flag(struct page *page) { }
+#endif
+
 static inline void set_page_order(struct page *page, int order)
 {
     set_page_private(page, order);
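
The __setup() hook above wires debug_guardpage_minorder up as a boot parameter, so with CONFIG_DEBUG_PAGEALLOC enabled something like debug_guardpage_minorder=1 on the kernel command line should take effect, while anything above MAX_ORDER / 2 is rejected. A user-space rehearsal of that validation (strtoul standing in for kstrtoul, MAX_ORDER assumed to be the common default of 11):

#include <stdio.h>
#include <stdlib.h>

#define MAX_ORDER 11  /* common default; a given config may differ */

/* User-space rehearsal of debug_guardpage_minorder_setup() above. */
static unsigned int parse_guardpage_minorder(const char *buf)
{
    char *end;
    unsigned long res = strtoul(buf, &end, 10);

    if (end == buf || res > MAX_ORDER / 2) {
        fprintf(stderr, "Bad debug_guardpage_minorder value\n");
        return 0;
    }
    return (unsigned int)res;
}

int main(void)
{
    printf("%u\n", parse_guardpage_minorder("2"));  /* accepted: 2 */
    printf("%u\n", parse_guardpage_minorder("9"));  /* rejected: above MAX_ORDER / 2 */
    return 0;
}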
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
     if (page_zone_id(page) != page_zone_id(buddy))
         return 0;
 
+    if (page_is_guard(buddy) && page_order(buddy) == order) {
+        VM_BUG_ON(page_count(buddy) != 0);
+        return 1;
+    }
+
     if (PageBuddy(buddy) && page_order(buddy) == order) {
         VM_BUG_ON(page_count(buddy) != 0);
         return 1;
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page,
         buddy = page + (buddy_idx - page_idx);
         if (!page_is_buddy(page, buddy, order))
             break;
-
-        /* Our buddy is free, merge with it and move up one order. */
-        list_del(&buddy->lru);
-        zone->free_area[order].nr_free--;
-        rmv_page_order(buddy);
+        /*
+         * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
+         * merge with it and move up one order.
+         */
+        if (page_is_guard(buddy)) {
+            clear_page_guard_flag(buddy);
+            set_page_private(page, 0);
+            __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
+        } else {
+            list_del(&buddy->lru);
+            zone->free_area[order].nr_free--;
+            rmv_page_order(buddy);
+        }
         combined_idx = buddy_idx & page_idx;
         page = page + (combined_idx - page_idx);
         page_idx = combined_idx;
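
When the buddy turns out to be a guard page it was never on a free list and its pages were already deducted from NR_FREE_PAGES in expand(), so the new branch credits them back instead of unlinking anything. The surrounding index arithmetic is unchanged; a small sketch of it (the helper that computes buddy_idx lives elsewhere in this file and is assumed here to be page_idx XOR (1 << order)):

#include <stdio.h>

/*
 * Buddy-index arithmetic used by the merge loop: XOR locates the buddy of a
 * block, AND gives the start of the combined next-order block. The kernel
 * helper that computes buddy_idx is assumed to do the XOR; it is not part of
 * this hunk.
 */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
    return page_idx ^ (1UL << order);
}

int main(void)
{
    unsigned long page_idx = 12;  /* order-2 block starting at page index 12 */
    unsigned int order = 2;

    unsigned long buddy_idx    = find_buddy_index(page_idx, order);  /* 8 */
    unsigned long combined_idx = buddy_idx & page_idx;               /* 8 */

    printf("buddy at %lu, merged order-%u block starts at %lu\n",
           buddy_idx, order + 1, combined_idx);
    return 0;
}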
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
     int i;
     int bad = 0;
 
-    trace_mm_page_free_direct(page, order);
+    trace_mm_page_free(page, order);
     kmemcheck_free_shadow(page, order);
 
     if (PageAnon(page))
@@ -670,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
     local_irq_restore(flags);
 }
 
-/*
- * permit the bootmem allocator to evade page validation on high-order frees
- */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
-    if (order == 0) {
-        __ClearPageReserved(page);
-        set_page_count(page, 0);
-        set_page_refcounted(page);
-        __free_page(page);
-    } else {
-        int loop;
-
-        prefetchw(page);
-        for (loop = 0; loop < (1 << order); loop++) {
-            struct page *p = &page[loop];
+    unsigned int nr_pages = 1 << order;
+    unsigned int loop;
 
-            if (loop + 1 < (1 << order))
-                prefetchw(p + 1);
-            __ClearPageReserved(p);
-            set_page_count(p, 0);
-        }
+    prefetchw(page);
+    for (loop = 0; loop < nr_pages; loop++) {
+        struct page *p = &page[loop];
 
-        set_page_refcounted(page);
-        __free_pages(page, order);
+        if (loop + 1 < nr_pages)
+            prefetchw(p + 1);
+        __ClearPageReserved(p);
+        set_page_count(p, 0);
     }
+
+    set_page_refcounted(page);
+    __free_pages(page, order);
 }
 
 
@@ -724,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
         high--;
         size >>= 1;
         VM_BUG_ON(bad_range(zone, &page[size]));
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+        if (high < debug_guardpage_minorder()) {
+            /*
+             * Mark as guard pages (or page), that will allow to
+             * merge back to allocator when buddy will be freed.
+             * Corresponding page table entries will not be touched,
+             * pages will stay not present in virtual address space
+             */
+            INIT_LIST_HEAD(&page[size].lru);
+            set_page_guard_flag(&page[size]);
+            set_page_private(&page[size], high);
+            /* Guard pages are not available for any usage */
+            __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
+            continue;
+        }
+#endif
         list_add(&page[size].lru, &area->free_list[migratetype]);
         area->nr_free++;
         set_page_order(&page[size], high);
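
A quick sketch of the split decision added above: when expand() carves an order-3 block down to an order-0 allocation with debug_guardpage_minorder() == 2, only the order-2 buddy reaches the free list; the lower-order buddies become guard pages and are subtracted from NR_FREE_PAGES.

#include <stdio.h>

/*
 * Sketch of the split decision in expand(): each halving produces a buddy of
 * order 'high'; with CONFIG_DEBUG_PAGEALLOC, buddies below the configured
 * minorder become guard pages instead of free-list entries.
 */
int main(void)
{
    unsigned int low = 0;       /* requested order                     */
    unsigned int high = 3;      /* order of the free block being split */
    unsigned int minorder = 2;  /* debug_guardpage_minorder()          */

    while (high > low) {
        high--;
        if (high < minorder)
            printf("order-%u buddy becomes guard pages\n", high);
        else
            printf("order-%u buddy goes to the free list\n", high);
    }
    return 0;
}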
@@ -1189,6 +1257,19 @@ out:
 }
 
 /*
+ * Free a list of 0-order pages
+ */
+void free_hot_cold_page_list(struct list_head *list, int cold)
+{
+    struct page *page, *next;
+
+    list_for_each_entry_safe(page, next, list, lru) {
+        trace_mm_page_free_batched(page, cold);
+        free_hot_cold_page(page, cold);
+    }
+}
+
+/*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
  * Each sub-page must be freed individually.
@@ -1435,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
     long min = mark;
     int o;
 
-    free_pages -= (1 << order) + 1;
+    free_pages -= (1 << order) - 1;
     if (alloc_flags & ALLOC_HIGH)
         min -= min / 2;
     if (alloc_flags & ALLOC_HARDER)
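
This is an off-by-two fix in the discount applied to free_pages before the watermark comparison: the request's own pages minus one are treated as no longer free, rather than the request plus one. The arithmetic for a few orders:

#include <stdio.h>

/* The discount applied to free_pages before the comparison, old vs. new. */
int main(void)
{
    unsigned int order;

    for (order = 0; order <= 3; order++) {
        long old_discount = (1L << order) + 1;  /* before this change */
        long new_discount = (1L << order) - 1;  /* after this change  */
        printf("order %u: old -%ld pages, new -%ld pages\n",
               order, old_discount, new_discount);
    }
    return 0;
}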
@@ -1645,6 +1726,35 @@ zonelist_scan:
         if ((alloc_flags & ALLOC_CPUSET) &&
             !cpuset_zone_allowed_softwall(zone, gfp_mask))
                 continue;
+        /*
+         * When allocating a page cache page for writing, we
+         * want to get it from a zone that is within its dirty
+         * limit, such that no single zone holds more than its
+         * proportional share of globally allowed dirty pages.
+         * The dirty limits take into account the zone's
+         * lowmem reserves and high watermark so that kswapd
+         * should be able to balance it without having to
+         * write pages from its LRU list.
+         *
+         * This may look like it could increase pressure on
+         * lower zones by failing allocations in higher zones
+         * before they are full. But the pages that do spill
+         * over are limited as the lower zones are protected
+         * by this very same mechanism. It should not become
+         * a practical burden to them.
+         *
+         * XXX: For now, allow allocations to potentially
+         * exceed the per-zone dirty limit in the slowpath
+         * (ALLOC_WMARK_LOW unset) before going into reclaim,
+         * which is important when on a NUMA setup the allowed
+         * zones are together not big enough to reach the
+         * global limit. The proper fix for these situations
+         * will require awareness of zones in the
+         * dirty-throttling and the flusher threads.
+         */
+        if ((alloc_flags & ALLOC_WMARK_LOW) &&
+            (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
+            goto this_zone_full;
 
         BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
         if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
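
zone_dirty_ok() itself is not in this file; it is assumed to come from the companion change in mm/page-writeback.c. The comment's "proportional share" idea, reduced to arithmetic with made-up numbers (units are thousands of pages):

#include <stdio.h>

/*
 * Each zone is allowed a share of the global dirty limit proportional to its
 * share of dirtyable memory; the real zone_dirty_ok() is assumed, not shown.
 */
int main(void)
{
    unsigned long global_dirtyable   = 2000;           /* all zones together */
    unsigned long global_dirty_limit = global_dirtyable * 20 / 100;
    unsigned long zone_dirtyable[2]  = { 1500, 500 };  /* e.g. Normal, DMA32 */
    int i;

    for (i = 0; i < 2; i++) {
        unsigned long zone_limit =
            global_dirty_limit * zone_dirtyable[i] / global_dirtyable;
        printf("zone %d may hold up to %lu (of %lu) dirty pages\n",
               i, zone_limit, global_dirty_limit);
    }
    return 0;
}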
@@ -1734,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 {
     unsigned int filter = SHOW_MEM_FILTER_NODES;
 
-    if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
+    if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
+        debug_guardpage_minorder() > 0)
         return;
 
     /*
@@ -1773,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
 
 static inline int
 should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+                unsigned long did_some_progress,
                 unsigned long pages_reclaimed)
 {
     /* Do not loop if specifically requested */
     if (gfp_mask & __GFP_NORETRY)
         return 0;
 
+    /* Always retry if specifically requested */
+    if (gfp_mask & __GFP_NOFAIL)
+        return 1;
+
+    /*
+     * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
+     * making forward progress without invoking OOM. Suspend also disables
+     * storage devices so kswapd will not help. Bail if we are suspending.
+     */
+    if (!did_some_progress && pm_suspended_storage())
+        return 0;
+
     /*
      * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
      * means __GFP_NOFAIL, but that may not be true in other
@@ -1797,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
     if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
         return 1;
 
-    /*
-     * Don't let big-order allocations loop unless the caller
-     * explicitly requests that.
-     */
-    if (gfp_mask & __GFP_NOFAIL)
-        return 1;
-
     return 0;
 }
 
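
Taken together with the previous hunk, the retry policy now reads: __GFP_NORETRY wins, then __GFP_NOFAIL, then the suspend bail-out, and only then the costly-order heuristics. A boolean reduction of that ordering (the gfp bits and PM state are plain flags here, unlike the kernel's):

#include <stdbool.h>
#include <stdio.h>

/*
 * Reduced decision order of should_alloc_retry() after this patch; the
 * costly-order and __GFP_REPEAT heuristics that follow in the real function
 * are not modelled.
 */
static bool should_retry(bool noretry, bool nofail,
                         bool made_progress, bool storage_suspended)
{
    if (noretry)
        return false;  /* __GFP_NORETRY is checked first           */
    if (nofail)
        return true;   /* __GFP_NOFAIL now short-circuits up front */
    if (!made_progress && storage_suspended)
        return false;  /* bail instead of looping during suspend   */
    return false;      /* remaining heuristics decide in the kernel */
}

int main(void)
{
    printf("%d\n", should_retry(false, false, false, true));  /* 0: give up      */
    printf("%d\n", should_retry(false, true,  false, true));  /* 1: NOFAIL retry */
    return 0;
}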
@@ -2196,7 +2313,8 @@ rebalance:
 
     /* Check if we should retry the allocation */
     pages_reclaimed += did_some_progress;
-    if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+    if (should_alloc_retry(gfp_mask, order, did_some_progress,
+                        pages_reclaimed)) {
         /* Wait for some write requests to complete then retry */
         wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
         goto rebalance;
@@ -2306,16 +2424,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(get_zeroed_page);
 
-void __pagevec_free(struct pagevec *pvec)
-{
-    int i = pagevec_count(pvec);
-
-    while (--i >= 0) {
-        trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
-        free_hot_cold_page(pvec->pages[i], pvec->cold);
-    }
-}
-
 void __free_pages(struct page *page, unsigned int order)
 {
     if (put_page_testzero(page)) {
@@ -3385,25 +3493,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
         if (page_to_nid(page) != zone_to_nid(zone))
             continue;
 
-        /* Blocks with reserved pages will never free, skip them. */
-        block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-        if (pageblock_is_reserved(pfn, block_end_pfn))
-            continue;
-
         block_migratetype = get_pageblock_migratetype(page);
 
-        /* If this block is reserved, account for it */
-        if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
-            reserve--;
-            continue;
-        }
+        /* Only test what is necessary when the reserves are not met */
+        if (reserve > 0) {
+            /*
+             * Blocks with reserved pages will never free, skip
+             * them.
+             */
+            block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+            if (pageblock_is_reserved(pfn, block_end_pfn))
+                continue;
 
-        /* Suitable for reserving if this block is movable */
-        if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
-            set_pageblock_migratetype(page, MIGRATE_RESERVE);
-            move_freepages_block(zone, page, MIGRATE_RESERVE);
-            reserve--;
-            continue;
+            /* If this block is reserved, account for it */
+            if (block_migratetype == MIGRATE_RESERVE) {
+                reserve--;
+                continue;
+            }
+
+            /* Suitable for reserving if this block is movable */
+            if (block_migratetype == MIGRATE_MOVABLE) {
+                set_pageblock_migratetype(page,
+                            MIGRATE_RESERVE);
+                move_freepages_block(zone, page,
+                            MIGRATE_RESERVE);
+                reserve--;
+                continue;
+            }
         }
 
         /*
@@ -4734,8 +4850,19 @@ static void calculate_totalreserve_pages(void)
             if (max > zone->present_pages)
                 max = zone->present_pages;
             reserve_pages += max;
+            /*
+             * Lowmem reserves are not available to
+             * GFP_HIGHUSER page cache allocations and
+             * kswapd tries to balance zones to their high
+             * watermark. As a result, neither should be
+             * regarded as dirtyable memory, to prevent a
+             * situation where reclaim has to clean pages
+             * in order to balance the zones.
+             */
+            zone->dirty_balance_reserve = max;
         }
     }
+    dirty_balance_reserve = reserve_pages;
     totalreserve_pages = reserve_pages;
 }
 