Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  848
1 file changed, 339 insertions, 509 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9dd443d89d8b..d2186ecb36f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
57#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h> 58#include <linux/memcontrol.h>
59#include <linux/prefetch.h> 59#include <linux/prefetch.h>
60#include <linux/page-debug-flags.h>
60 61
61#include <asm/tlbflush.h> 62#include <asm/tlbflush.h>
62#include <asm/div64.h> 63#include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
96 97
97unsigned long totalram_pages __read_mostly; 98unsigned long totalram_pages __read_mostly;
98unsigned long totalreserve_pages __read_mostly; 99unsigned long totalreserve_pages __read_mostly;
100/*
101 * When calculating the number of globally allowed dirty pages, there
102 * is a certain number of per-zone reserves that should not be
103 * considered dirtyable memory. This is the sum of those reserves
104 * over all existing zones that contribute dirtyable memory.
105 */
106unsigned long dirty_balance_reserve __read_mostly;
107
99int percpu_pagelist_fraction; 108int percpu_pagelist_fraction;
100gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 109gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
101 110
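The reserve introduced above is meant to be discounted wherever the amount of dirtyable memory is computed. A minimal sketch of such a consumer on the writeback side; the helper name and placement are assumptions for illustration, not part of this patch:

/* Sketch only: discount the per-zone reserves from dirtyable memory. */
static unsigned long sketch_dirtyable_memory(void)
{
        unsigned long x;

        x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
        x -= min(x, dirty_balance_reserve);
        return x;
}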
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
127 saved_gfp_mask = gfp_allowed_mask; 136 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS; 137 gfp_allowed_mask &= ~GFP_IOFS;
129} 138}
139
140bool pm_suspended_storage(void)
141{
142 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
143 return false;
144 return true;
145}
130#endif /* CONFIG_PM_SLEEP */ 146#endif /* CONFIG_PM_SLEEP */
131 147
132#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 148#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
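pm_suspended_storage() reports whether pm_restrict_gfp_mask() has stripped GFP_IOFS, i.e. whether block devices may already be suspended for hibernation. The intended calling pattern, roughly as used by should_alloc_retry() later in this patch:

        /* Illustration of the caller added further down: */
        if (!did_some_progress && pm_suspended_storage())
                return 0;       /* storage is suspended, retrying cannot help */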
@@ -181,39 +197,17 @@ static unsigned long __meminitdata nr_kernel_pages;
181static unsigned long __meminitdata nr_all_pages; 197static unsigned long __meminitdata nr_all_pages;
182static unsigned long __meminitdata dma_reserve; 198static unsigned long __meminitdata dma_reserve;
183 199
184#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 200#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
185 /* 201static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
186 * MAX_ACTIVE_REGIONS determines the maximum number of distinct 202static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
187 * ranges of memory (RAM) that may be registered with add_active_range(). 203static unsigned long __initdata required_kernelcore;
188 * Ranges passed to add_active_range() will be merged if possible 204static unsigned long __initdata required_movablecore;
189 * so the number of times add_active_range() can be called is 205static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
190 * related to the number of nodes and the number of holes 206
191 */ 207/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
192 #ifdef CONFIG_MAX_ACTIVE_REGIONS 208int movable_zone;
193 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ 209EXPORT_SYMBOL(movable_zone);
194 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS 210#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
195 #else
196 #if MAX_NUMNODES >= 32
197 /* If there can be many nodes, allow up to 50 holes per node */
198 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
199 #else
200 /* By default, allow up to 256 distinct regions */
201 #define MAX_ACTIVE_REGIONS 256
202 #endif
203 #endif
204
205 static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
206 static int __meminitdata nr_nodemap_entries;
207 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
208 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
209 static unsigned long __initdata required_kernelcore;
210 static unsigned long __initdata required_movablecore;
211 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
212
213 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
214 int movable_zone;
215 EXPORT_SYMBOL(movable_zone);
216#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
217 211
218#if MAX_NUMNODES > 1 212#if MAX_NUMNODES > 1
219int nr_node_ids __read_mostly = MAX_NUMNODES; 213int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -333,8 +327,8 @@ out:
333 * 327 *
334 * The remaining PAGE_SIZE pages are called "tail pages". 328 * The remaining PAGE_SIZE pages are called "tail pages".
335 * 329 *
336 * All pages have PG_compound set. All pages have their ->private pointing at 330 * All pages have PG_compound set. All tail pages have their ->first_page
337 * the head page (even the head page has this). 331 * pointing at the head page.
338 * 332 *
339 * The first tail page's ->lru.next holds the address of the compound page's 333 * The first tail page's ->lru.next holds the address of the compound page's
340 * put_page() function. Its ->lru.prev holds the order of allocation. 334 * put_page() function. Its ->lru.prev holds the order of allocation.
@@ -356,8 +350,8 @@ void prep_compound_page(struct page *page, unsigned long order)
356 __SetPageHead(page); 350 __SetPageHead(page);
357 for (i = 1; i < nr_pages; i++) { 351 for (i = 1; i < nr_pages; i++) {
358 struct page *p = page + i; 352 struct page *p = page + i;
359
360 __SetPageTail(p); 353 __SetPageTail(p);
354 set_page_count(p, 0);
361 p->first_page = page; 355 p->first_page = page;
362 } 356 }
363} 357}
@@ -403,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
403 clear_highpage(page + i); 397 clear_highpage(page + i);
404} 398}
405 399
400#ifdef CONFIG_DEBUG_PAGEALLOC
401unsigned int _debug_guardpage_minorder;
402
403static int __init debug_guardpage_minorder_setup(char *buf)
404{
405 unsigned long res;
406
407 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
408 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
409 return 0;
410 }
411 _debug_guardpage_minorder = res;
412 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
413 return 0;
414}
415__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
416
417static inline void set_page_guard_flag(struct page *page)
418{
419 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
420}
421
422static inline void clear_page_guard_flag(struct page *page)
423{
424 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
425}
426#else
427static inline void set_page_guard_flag(struct page *page) { }
428static inline void clear_page_guard_flag(struct page *page) { }
429#endif
430
406static inline void set_page_order(struct page *page, int order) 431static inline void set_page_order(struct page *page, int order)
407{ 432{
408 set_page_private(page, order); 433 set_page_private(page, order);
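The flag helpers above pair with two accessors, debug_guardpage_minorder() and page_is_guard(), which later hunks rely on but which are defined outside this file. A sketch of what they are assumed to look like:

#ifdef CONFIG_DEBUG_PAGEALLOC
/* Sketch only: accessors assumed by the hunks below. */
static inline unsigned int debug_guardpage_minorder(void)
{
        return _debug_guardpage_minorder;
}

static inline bool page_is_guard(struct page *page)
{
        return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
}
#else
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool page_is_guard(struct page *page) { return false; }
#endif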
@@ -460,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
460 if (page_zone_id(page) != page_zone_id(buddy)) 485 if (page_zone_id(page) != page_zone_id(buddy))
461 return 0; 486 return 0;
462 487
488 if (page_is_guard(buddy) && page_order(buddy) == order) {
489 VM_BUG_ON(page_count(buddy) != 0);
490 return 1;
491 }
492
463 if (PageBuddy(buddy) && page_order(buddy) == order) { 493 if (PageBuddy(buddy) && page_order(buddy) == order) {
464 VM_BUG_ON(page_count(buddy) != 0); 494 VM_BUG_ON(page_count(buddy) != 0);
465 return 1; 495 return 1;
@@ -516,11 +546,19 @@ static inline void __free_one_page(struct page *page,
516 buddy = page + (buddy_idx - page_idx); 546 buddy = page + (buddy_idx - page_idx);
517 if (!page_is_buddy(page, buddy, order)) 547 if (!page_is_buddy(page, buddy, order))
518 break; 548 break;
519 549 /*
520 /* Our buddy is free, merge with it and move up one order. */ 550 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
521 list_del(&buddy->lru); 551 * merge with it and move up one order.
522 zone->free_area[order].nr_free--; 552 */
523 rmv_page_order(buddy); 553 if (page_is_guard(buddy)) {
554 clear_page_guard_flag(buddy);
555 set_page_private(page, 0);
556 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
557 } else {
558 list_del(&buddy->lru);
559 zone->free_area[order].nr_free--;
560 rmv_page_order(buddy);
561 }
524 combined_idx = buddy_idx & page_idx; 562 combined_idx = buddy_idx & page_idx;
525 page = page + (combined_idx - page_idx); 563 page = page + (combined_idx - page_idx);
526 page_idx = combined_idx; 564 page_idx = combined_idx;
@@ -654,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
654 int i; 692 int i;
655 int bad = 0; 693 int bad = 0;
656 694
657 trace_mm_page_free_direct(page, order); 695 trace_mm_page_free(page, order);
658 kmemcheck_free_shadow(page, order); 696 kmemcheck_free_shadow(page, order);
659 697
660 if (PageAnon(page)) 698 if (PageAnon(page))
@@ -692,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
692 local_irq_restore(flags); 730 local_irq_restore(flags);
693} 731}
694 732
695/*
696 * permit the bootmem allocator to evade page validation on high-order frees
697 */
698void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 733void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
699{ 734{
700 if (order == 0) { 735 unsigned int nr_pages = 1 << order;
701 __ClearPageReserved(page); 736 unsigned int loop;
702 set_page_count(page, 0);
703 set_page_refcounted(page);
704 __free_page(page);
705 } else {
706 int loop;
707 737
708 prefetchw(page); 738 prefetchw(page);
709 for (loop = 0; loop < BITS_PER_LONG; loop++) { 739 for (loop = 0; loop < nr_pages; loop++) {
710 struct page *p = &page[loop]; 740 struct page *p = &page[loop];
711 741
712 if (loop + 1 < BITS_PER_LONG) 742 if (loop + 1 < nr_pages)
713 prefetchw(p + 1); 743 prefetchw(p + 1);
714 __ClearPageReserved(p); 744 __ClearPageReserved(p);
715 set_page_count(p, 0); 745 set_page_count(p, 0);
716 }
717
718 set_page_refcounted(page);
719 __free_pages(page, order);
720 } 746 }
747
748 set_page_refcounted(page);
749 __free_pages(page, order);
721} 750}
722 751
723 752
@@ -746,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
746 high--; 775 high--;
747 size >>= 1; 776 size >>= 1;
748 VM_BUG_ON(bad_range(zone, &page[size])); 777 VM_BUG_ON(bad_range(zone, &page[size]));
778
779#ifdef CONFIG_DEBUG_PAGEALLOC
780 if (high < debug_guardpage_minorder()) {
781 /*
782 * Mark the split-off chunk as guard pages so that it can be
783 * merged back into the allocator when its buddy is freed.
784 * The corresponding page table entries are not touched; the
785 * pages remain not-present in the virtual address space.
786 */
787 INIT_LIST_HEAD(&page[size].lru);
788 set_page_guard_flag(&page[size]);
789 set_page_private(&page[size], high);
790 /* Guard pages are not available for any usage */
791 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
792 continue;
793 }
794#endif
749 list_add(&page[size].lru, &area->free_list[migratetype]); 795 list_add(&page[size].lru, &area->free_list[migratetype]);
750 area->nr_free++; 796 area->nr_free++;
751 set_page_order(&page[size], high); 797 set_page_order(&page[size], high);
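Together with the setup hook earlier, the guard mechanism is driven from the kernel command line; an illustrative (not prescriptive) setting:

        debug_guardpage_minorder=2

With CONFIG_DEBUG_PAGEALLOC enabled, every chunk split off below order 2 is then marked as a guard range instead of going back onto a free list: it is subtracted from NR_FREE_PAGES here and only handed back when its buddy is freed, as the __free_one_page() hunk above shows.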
@@ -1211,6 +1257,19 @@ out:
1211} 1257}
1212 1258
1213/* 1259/*
1260 * Free a list of 0-order pages
1261 */
1262void free_hot_cold_page_list(struct list_head *list, int cold)
1263{
1264 struct page *page, *next;
1265
1266 list_for_each_entry_safe(page, next, list, lru) {
1267 trace_mm_page_free_batched(page, cold);
1268 free_hot_cold_page(page, cold);
1269 }
1270}
1271
1272/*
1214 * split_page takes a non-compound higher-order page, and splits it into 1273 * split_page takes a non-compound higher-order page, and splits it into
1215 * n (1<<order) sub-pages: page[0..n] 1274 * n (1<<order) sub-pages: page[0..n]
1216 * Each sub-page must be freed individually. 1275 * Each sub-page must be freed individually.
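free_hot_cold_page_list() takes over the batched freeing previously done through __pagevec_free(), which is removed further down. A hedged usage sketch, with the page source left hypothetical:

        LIST_HEAD(pages_to_free);
        struct page *page;

        while ((page = next_victim_page()) != NULL) /* hypothetical source */
                list_add(&page->lru, &pages_to_free);

        /* second argument: 0 == hot pages, 1 == cold pages */
        free_hot_cold_page_list(&pages_to_free, 1);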
@@ -1408,7 +1467,7 @@ static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1408 1467
1409static int __init fail_page_alloc_debugfs(void) 1468static int __init fail_page_alloc_debugfs(void)
1410{ 1469{
1411 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1470 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1412 struct dentry *dir; 1471 struct dentry *dir;
1413 1472
1414 dir = fault_create_debugfs_attr("fail_page_alloc", NULL, 1473 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
@@ -1457,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1457 long min = mark; 1516 long min = mark;
1458 int o; 1517 int o;
1459 1518
1460 free_pages -= (1 << order) + 1; 1519 free_pages -= (1 << order) - 1;
1461 if (alloc_flags & ALLOC_HIGH) 1520 if (alloc_flags & ALLOC_HIGH)
1462 min -= min / 2; 1521 min -= min / 2;
1463 if (alloc_flags & ALLOC_HARDER) 1522 if (alloc_flags & ALLOC_HARDER)
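The sign change alters how many of the requested pages are deducted before the per-order watermark loop runs; a quick worked example:

        /* order-3 request: the old code deducted (1 << 3) + 1 = 9 pages
         * from free_pages before the checks, the corrected code deducts
         * (1 << 3) - 1 = 7. */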
@@ -1667,6 +1726,35 @@ zonelist_scan:
1667 if ((alloc_flags & ALLOC_CPUSET) && 1726 if ((alloc_flags & ALLOC_CPUSET) &&
1668 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1727 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1669 continue; 1728 continue;
1729 /*
1730 * When allocating a page cache page for writing, we
1731 * want to get it from a zone that is within its dirty
1732 * limit, such that no single zone holds more than its
1733 * proportional share of globally allowed dirty pages.
1734 * The dirty limits take into account the zone's
1735 * lowmem reserves and high watermark so that kswapd
1736 * should be able to balance it without having to
1737 * write pages from its LRU list.
1738 *
1739 * This may look like it could increase pressure on
1740 * lower zones by failing allocations in higher zones
1741 * before they are full. But the pages that do spill
1742 * over are limited as the lower zones are protected
1743 * by this very same mechanism. It should not become
1744 * a practical burden to them.
1745 *
1746 * XXX: For now, allow allocations to potentially
1747 * exceed the per-zone dirty limit in the slowpath
1748 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1749 * which is important when on a NUMA setup the allowed
1750 * zones are together not big enough to reach the
1751 * global limit. The proper fix for these situations
1752 * will require awareness of zones in the
1753 * dirty-throttling and the flusher threads.
1754 */
1755 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1756 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1757 goto this_zone_full;
1670 1758
1671 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1759 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1672 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1760 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
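zone_dirty_ok() itself lives on the writeback side; a hedged sketch of the check the comment above describes, with the limit helper and the exact counters treated as assumptions:

/* Sketch only: a zone is "dirty ok" while its dirty and writeback pages
 * stay below its proportional share of the global dirty limit. */
static bool sketch_zone_dirty_ok(struct zone *zone)
{
        unsigned long limit = sketch_zone_dirty_limit(zone); /* hypothetical */

        return zone_page_state(zone, NR_FILE_DIRTY) +
               zone_page_state(zone, NR_UNSTABLE_NFS) +
               zone_page_state(zone, NR_WRITEBACK) <= limit;
}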
@@ -1756,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1756{ 1844{
1757 unsigned int filter = SHOW_MEM_FILTER_NODES; 1845 unsigned int filter = SHOW_MEM_FILTER_NODES;
1758 1846
1759 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 1847 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1848 debug_guardpage_minorder() > 0)
1760 return; 1849 return;
1761 1850
1762 /* 1851 /*
@@ -1795,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1795 1884
1796static inline int 1885static inline int
1797should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1886should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1887 unsigned long did_some_progress,
1798 unsigned long pages_reclaimed) 1888 unsigned long pages_reclaimed)
1799{ 1889{
1800 /* Do not loop if specifically requested */ 1890 /* Do not loop if specifically requested */
1801 if (gfp_mask & __GFP_NORETRY) 1891 if (gfp_mask & __GFP_NORETRY)
1802 return 0; 1892 return 0;
1803 1893
1894 /* Always retry if specifically requested */
1895 if (gfp_mask & __GFP_NOFAIL)
1896 return 1;
1897
1898 /*
1899 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
1900 * making forward progress without invoking OOM. Suspend also disables
1901 * storage devices so kswapd will not help. Bail if we are suspending.
1902 */
1903 if (!did_some_progress && pm_suspended_storage())
1904 return 0;
1905
1804 /* 1906 /*
1805 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 1907 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1806 * means __GFP_NOFAIL, but that may not be true in other 1908 * means __GFP_NOFAIL, but that may not be true in other
@@ -1819,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1819 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 1921 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1820 return 1; 1922 return 1;
1821 1923
1822 /*
1823 * Don't let big-order allocations loop unless the caller
1824 * explicitly requests that.
1825 */
1826 if (gfp_mask & __GFP_NOFAIL)
1827 return 1;
1828
1829 return 0; 1924 return 0;
1830} 1925}
1831 1926
@@ -1886,14 +1981,20 @@ static struct page *
1886__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1981__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1887 struct zonelist *zonelist, enum zone_type high_zoneidx, 1982 struct zonelist *zonelist, enum zone_type high_zoneidx,
1888 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1983 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1889 int migratetype, unsigned long *did_some_progress, 1984 int migratetype, bool sync_migration,
1890 bool sync_migration) 1985 bool *deferred_compaction,
1986 unsigned long *did_some_progress)
1891{ 1987{
1892 struct page *page; 1988 struct page *page;
1893 1989
1894 if (!order || compaction_deferred(preferred_zone)) 1990 if (!order)
1895 return NULL; 1991 return NULL;
1896 1992
1993 if (compaction_deferred(preferred_zone)) {
1994 *deferred_compaction = true;
1995 return NULL;
1996 }
1997
1897 current->flags |= PF_MEMALLOC; 1998 current->flags |= PF_MEMALLOC;
1898 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1999 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1899 nodemask, sync_migration); 2000 nodemask, sync_migration);
@@ -1921,7 +2022,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1921 * but not enough to satisfy watermarks. 2022 * but not enough to satisfy watermarks.
1922 */ 2023 */
1923 count_vm_event(COMPACTFAIL); 2024 count_vm_event(COMPACTFAIL);
1924 defer_compaction(preferred_zone); 2025
2026 /*
2027 * As async compaction considers a subset of pageblocks, only
2028 * defer if the failure was a sync compaction failure.
2029 */
2030 if (sync_migration)
2031 defer_compaction(preferred_zone);
1925 2032
1926 cond_resched(); 2033 cond_resched();
1927 } 2034 }
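compaction_deferred() and defer_compaction() implement an exponential backoff maintained by the compaction code; a rough sketch of the bookkeeping this hunk depends on, with the field names and shift cap treated as assumptions:

/* Sketch only: each sync-compaction failure widens the window of
 * allocation attempts during which further compaction is skipped. */
static void sketch_defer_compaction(struct zone *zone)
{
        zone->compact_considered = 0;
        if (++zone->compact_defer_shift > 6)
                zone->compact_defer_shift = 6;
}

static bool sketch_compaction_deferred(struct zone *zone)
{
        return ++zone->compact_considered <=
               (1UL << zone->compact_defer_shift);
}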
@@ -1933,8 +2040,9 @@ static inline struct page *
1933__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2040__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1934 struct zonelist *zonelist, enum zone_type high_zoneidx, 2041 struct zonelist *zonelist, enum zone_type high_zoneidx,
1935 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2042 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1936 int migratetype, unsigned long *did_some_progress, 2043 int migratetype, bool sync_migration,
1937 bool sync_migration) 2044 bool *deferred_compaction,
2045 unsigned long *did_some_progress)
1938{ 2046{
1939 return NULL; 2047 return NULL;
1940} 2048}
@@ -2084,6 +2192,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2084 unsigned long pages_reclaimed = 0; 2192 unsigned long pages_reclaimed = 0;
2085 unsigned long did_some_progress; 2193 unsigned long did_some_progress;
2086 bool sync_migration = false; 2194 bool sync_migration = false;
2195 bool deferred_compaction = false;
2087 2196
2088 /* 2197 /*
2089 * In the slowpath, we sanity check order to avoid ever trying to 2198 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2164,12 +2273,22 @@ rebalance:
2164 zonelist, high_zoneidx, 2273 zonelist, high_zoneidx,
2165 nodemask, 2274 nodemask,
2166 alloc_flags, preferred_zone, 2275 alloc_flags, preferred_zone,
2167 migratetype, &did_some_progress, 2276 migratetype, sync_migration,
2168 sync_migration); 2277 &deferred_compaction,
2278 &did_some_progress);
2169 if (page) 2279 if (page)
2170 goto got_pg; 2280 goto got_pg;
2171 sync_migration = true; 2281 sync_migration = true;
2172 2282
2283 /*
2284 * If compaction is deferred for high-order allocations, it is because
2285 * sync compaction recently failed. If this is the case and the caller
2286 * has requested the system not be heavily disrupted, fail the
2287 * allocation now instead of entering direct reclaim.
2288 */
2289 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
2290 goto nopage;
2291
2173 /* Try direct reclaim and then allocating */ 2292 /* Try direct reclaim and then allocating */
2174 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2293 page = __alloc_pages_direct_reclaim(gfp_mask, order,
2175 zonelist, high_zoneidx, 2294 zonelist, high_zoneidx,
@@ -2218,7 +2337,8 @@ rebalance:
2218 2337
2219 /* Check if we should retry the allocation */ 2338 /* Check if we should retry the allocation */
2220 pages_reclaimed += did_some_progress; 2339 pages_reclaimed += did_some_progress;
2221 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2340 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2341 pages_reclaimed)) {
2222 /* Wait for some write requests to complete then retry */ 2342 /* Wait for some write requests to complete then retry */
2223 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2343 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2224 goto rebalance; 2344 goto rebalance;
@@ -2232,8 +2352,9 @@ rebalance:
2232 zonelist, high_zoneidx, 2352 zonelist, high_zoneidx,
2233 nodemask, 2353 nodemask,
2234 alloc_flags, preferred_zone, 2354 alloc_flags, preferred_zone,
2235 migratetype, &did_some_progress, 2355 migratetype, sync_migration,
2236 sync_migration); 2356 &deferred_compaction,
2357 &did_some_progress);
2237 if (page) 2358 if (page)
2238 goto got_pg; 2359 goto got_pg;
2239 } 2360 }
@@ -2328,16 +2449,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
2328} 2449}
2329EXPORT_SYMBOL(get_zeroed_page); 2450EXPORT_SYMBOL(get_zeroed_page);
2330 2451
2331void __pagevec_free(struct pagevec *pvec)
2332{
2333 int i = pagevec_count(pvec);
2334
2335 while (--i >= 0) {
2336 trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
2337 free_hot_cold_page(pvec->pages[i], pvec->cold);
2338 }
2339}
2340
2341void __free_pages(struct page *page, unsigned int order) 2452void __free_pages(struct page *page, unsigned int order)
2342{ 2453{
2343 if (put_page_testzero(page)) { 2454 if (put_page_testzero(page)) {
@@ -3377,9 +3488,15 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3377 unsigned long block_migratetype; 3488 unsigned long block_migratetype;
3378 int reserve; 3489 int reserve;
3379 3490
3380 /* Get the start pfn, end pfn and the number of blocks to reserve */ 3491 /*
3492 * Get the start pfn, end pfn and the number of blocks to reserve.
3493 * We have to be careful to be aligned to pageblock_nr_pages to
3494 * make sure that we always check pfn_valid for the first page in
3495 * the block.
3496 */
3381 start_pfn = zone->zone_start_pfn; 3497 start_pfn = zone->zone_start_pfn;
3382 end_pfn = start_pfn + zone->spanned_pages; 3498 end_pfn = start_pfn + zone->spanned_pages;
3499 start_pfn = roundup(start_pfn, pageblock_nr_pages);
3383 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> 3500 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
3384 pageblock_order; 3501 pageblock_order;
3385 3502
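The added roundup() guarantees that the scan below only ever inspects the first pfn of a pageblock; a small worked example, assuming pageblock_nr_pages == 512:

        /* zone_start_pfn = 1000  ->  roundup(1000, 512) = 1024, so the
         * loop starts on a pageblock boundary and pfn_valid() is always
         * applied to the first page of each block. */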
@@ -3401,25 +3518,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3401 if (page_to_nid(page) != zone_to_nid(zone)) 3518 if (page_to_nid(page) != zone_to_nid(zone))
3402 continue; 3519 continue;
3403 3520
3404 /* Blocks with reserved pages will never free, skip them. */
3405 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3406 if (pageblock_is_reserved(pfn, block_end_pfn))
3407 continue;
3408
3409 block_migratetype = get_pageblock_migratetype(page); 3521 block_migratetype = get_pageblock_migratetype(page);
3410 3522
3411 /* If this block is reserved, account for it */ 3523 /* Only test what is necessary when the reserves are not met */
3412 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 3524 if (reserve > 0) {
3413 reserve--; 3525 /*
3414 continue; 3526 * Blocks with reserved pages will never free, skip
3415 } 3527 * them.
3528 */
3529 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3530 if (pageblock_is_reserved(pfn, block_end_pfn))
3531 continue;
3416 3532
3417 /* Suitable for reserving if this block is movable */ 3533 /* If this block is reserved, account for it */
3418 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 3534 if (block_migratetype == MIGRATE_RESERVE) {
3419 set_pageblock_migratetype(page, MIGRATE_RESERVE); 3535 reserve--;
3420 move_freepages_block(zone, page, MIGRATE_RESERVE); 3536 continue;
3421 reserve--; 3537 }
3422 continue; 3538
3539 /* Suitable for reserving if this block is movable */
3540 if (block_migratetype == MIGRATE_MOVABLE) {
3541 set_pageblock_migratetype(page,
3542 MIGRATE_RESERVE);
3543 move_freepages_block(zone, page,
3544 MIGRATE_RESERVE);
3545 reserve--;
3546 continue;
3547 }
3423 } 3548 }
3424 3549
3425 /* 3550 /*
@@ -3731,35 +3856,7 @@ __meminit int init_currently_empty_zone(struct zone *zone,
3731 return 0; 3856 return 0;
3732} 3857}
3733 3858
3734#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 3859#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
3735/*
3736 * Basic iterator support. Return the first range of PFNs for a node
3737 * Note: nid == MAX_NUMNODES returns first region regardless of node
3738 */
3739static int __meminit first_active_region_index_in_nid(int nid)
3740{
3741 int i;
3742
3743 for (i = 0; i < nr_nodemap_entries; i++)
3744 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3745 return i;
3746
3747 return -1;
3748}
3749
3750/*
3751 * Basic iterator support. Return the next active range of PFNs for a node
3752 * Note: nid == MAX_NUMNODES returns next region regardless of node
3753 */
3754static int __meminit next_active_region_index_in_nid(int index, int nid)
3755{
3756 for (index = index + 1; index < nr_nodemap_entries; index++)
3757 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3758 return index;
3759
3760 return -1;
3761}
3762
3763#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 3860#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
3764/* 3861/*
3765 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 3862 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
@@ -3769,15 +3866,12 @@ static int __meminit next_active_region_index_in_nid(int index, int nid)
3769 */ 3866 */
3770int __meminit __early_pfn_to_nid(unsigned long pfn) 3867int __meminit __early_pfn_to_nid(unsigned long pfn)
3771{ 3868{
3772 int i; 3869 unsigned long start_pfn, end_pfn;
3773 3870 int i, nid;
3774 for (i = 0; i < nr_nodemap_entries; i++) {
3775 unsigned long start_pfn = early_node_map[i].start_pfn;
3776 unsigned long end_pfn = early_node_map[i].end_pfn;
3777 3871
3872 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
3778 if (start_pfn <= pfn && pfn < end_pfn) 3873 if (start_pfn <= pfn && pfn < end_pfn)
3779 return early_node_map[i].nid; 3874 return nid;
3780 }
3781 /* This is a memory hole */ 3875 /* This is a memory hole */
3782 return -1; 3876 return -1;
3783} 3877}
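The conversion pattern used throughout the rest of this patch replaces early_node_map[] walks with memblock's pfn-range iterator; in isolation it looks roughly like this (the printout is illustrative only):

        unsigned long start_pfn, end_pfn;
        int i, nid;

        /* Pass a node id instead of MAX_NUMNODES to restrict the walk. */
        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
                printk(KERN_DEBUG "node %d: pfn %#lx -> %#lx\n",
                       nid, start_pfn, end_pfn);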
@@ -3806,11 +3900,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
3806} 3900}
3807#endif 3901#endif
3808 3902
3809/* Basic iterator support to walk early_node_map[] */
3810#define for_each_active_range_index_in_nid(i, nid) \
3811 for (i = first_active_region_index_in_nid(nid); i != -1; \
3812 i = next_active_region_index_in_nid(i, nid))
3813
3814/** 3903/**
3815 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 3904 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
3816 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 3905 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -3820,122 +3909,34 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
3820 * add_active_ranges() contain no holes and may be freed, this 3909 * add_active_ranges() contain no holes and may be freed, this
3821 * this function may be used instead of calling free_bootmem() manually. 3910 * this function may be used instead of calling free_bootmem() manually.
3822 */ 3911 */
3823void __init free_bootmem_with_active_regions(int nid, 3912void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
3824 unsigned long max_low_pfn)
3825{
3826 int i;
3827
3828 for_each_active_range_index_in_nid(i, nid) {
3829 unsigned long size_pages = 0;
3830 unsigned long end_pfn = early_node_map[i].end_pfn;
3831
3832 if (early_node_map[i].start_pfn >= max_low_pfn)
3833 continue;
3834
3835 if (end_pfn > max_low_pfn)
3836 end_pfn = max_low_pfn;
3837
3838 size_pages = end_pfn - early_node_map[i].start_pfn;
3839 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
3840 PFN_PHYS(early_node_map[i].start_pfn),
3841 size_pages << PAGE_SHIFT);
3842 }
3843}
3844
3845#ifdef CONFIG_HAVE_MEMBLOCK
3846/*
3847 * Basic iterator support. Return the last range of PFNs for a node
3848 * Note: nid == MAX_NUMNODES returns last region regardless of node
3849 */
3850static int __meminit last_active_region_index_in_nid(int nid)
3851{
3852 int i;
3853
3854 for (i = nr_nodemap_entries - 1; i >= 0; i--)
3855 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3856 return i;
3857
3858 return -1;
3859}
3860
3861/*
3862 * Basic iterator support. Return the previous active range of PFNs for a node
3863 * Note: nid == MAX_NUMNODES returns next region regardless of node
3864 */
3865static int __meminit previous_active_region_index_in_nid(int index, int nid)
3866{
3867 for (index = index - 1; index >= 0; index--)
3868 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3869 return index;
3870
3871 return -1;
3872}
3873
3874#define for_each_active_range_index_in_nid_reverse(i, nid) \
3875 for (i = last_active_region_index_in_nid(nid); i != -1; \
3876 i = previous_active_region_index_in_nid(i, nid))
3877
3878u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3879 u64 goal, u64 limit)
3880{ 3913{
3881 int i; 3914 unsigned long start_pfn, end_pfn;
3882 3915 int i, this_nid;
3883 /* Need to go over early_node_map to find out good range for node */
3884 for_each_active_range_index_in_nid_reverse(i, nid) {
3885 u64 addr;
3886 u64 ei_start, ei_last;
3887 u64 final_start, final_end;
3888
3889 ei_last = early_node_map[i].end_pfn;
3890 ei_last <<= PAGE_SHIFT;
3891 ei_start = early_node_map[i].start_pfn;
3892 ei_start <<= PAGE_SHIFT;
3893
3894 final_start = max(ei_start, goal);
3895 final_end = min(ei_last, limit);
3896
3897 if (final_start >= final_end)
3898 continue;
3899
3900 addr = memblock_find_in_range(final_start, final_end, size, align);
3901 3916
3902 if (addr == MEMBLOCK_ERROR) 3917 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
3903 continue; 3918 start_pfn = min(start_pfn, max_low_pfn);
3919 end_pfn = min(end_pfn, max_low_pfn);
3904 3920
3905 return addr; 3921 if (start_pfn < end_pfn)
3922 free_bootmem_node(NODE_DATA(this_nid),
3923 PFN_PHYS(start_pfn),
3924 (end_pfn - start_pfn) << PAGE_SHIFT);
3906 } 3925 }
3907
3908 return MEMBLOCK_ERROR;
3909} 3926}
3910#endif
3911 3927
3912int __init add_from_early_node_map(struct range *range, int az, 3928int __init add_from_early_node_map(struct range *range, int az,
3913 int nr_range, int nid) 3929 int nr_range, int nid)
3914{ 3930{
3931 unsigned long start_pfn, end_pfn;
3915 int i; 3932 int i;
3916 u64 start, end;
3917 3933
3918 /* need to go over early_node_map to find out good range for node */ 3934 /* need to go over early_node_map to find out good range for node */
3919 for_each_active_range_index_in_nid(i, nid) { 3935 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
3920 start = early_node_map[i].start_pfn; 3936 nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
3921 end = early_node_map[i].end_pfn;
3922 nr_range = add_range(range, az, nr_range, start, end);
3923 }
3924 return nr_range; 3937 return nr_range;
3925} 3938}
3926 3939
3927void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3928{
3929 int i;
3930 int ret;
3931
3932 for_each_active_range_index_in_nid(i, nid) {
3933 ret = work_fn(early_node_map[i].start_pfn,
3934 early_node_map[i].end_pfn, data);
3935 if (ret)
3936 break;
3937 }
3938}
3939/** 3940/**
3940 * sparse_memory_present_with_active_regions - Call memory_present for each active range 3941 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3941 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 3942 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -3946,12 +3947,11 @@ void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3946 */ 3947 */
3947void __init sparse_memory_present_with_active_regions(int nid) 3948void __init sparse_memory_present_with_active_regions(int nid)
3948{ 3949{
3949 int i; 3950 unsigned long start_pfn, end_pfn;
3951 int i, this_nid;
3950 3952
3951 for_each_active_range_index_in_nid(i, nid) 3953 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
3952 memory_present(early_node_map[i].nid, 3954 memory_present(this_nid, start_pfn, end_pfn);
3953 early_node_map[i].start_pfn,
3954 early_node_map[i].end_pfn);
3955} 3955}
3956 3956
3957/** 3957/**
@@ -3968,13 +3968,15 @@ void __init sparse_memory_present_with_active_regions(int nid)
3968void __meminit get_pfn_range_for_nid(unsigned int nid, 3968void __meminit get_pfn_range_for_nid(unsigned int nid,
3969 unsigned long *start_pfn, unsigned long *end_pfn) 3969 unsigned long *start_pfn, unsigned long *end_pfn)
3970{ 3970{
3971 unsigned long this_start_pfn, this_end_pfn;
3971 int i; 3972 int i;
3973
3972 *start_pfn = -1UL; 3974 *start_pfn = -1UL;
3973 *end_pfn = 0; 3975 *end_pfn = 0;
3974 3976
3975 for_each_active_range_index_in_nid(i, nid) { 3977 for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
3976 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 3978 *start_pfn = min(*start_pfn, this_start_pfn);
3977 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3979 *end_pfn = max(*end_pfn, this_end_pfn);
3978 } 3980 }
3979 3981
3980 if (*start_pfn == -1UL) 3982 if (*start_pfn == -1UL)
@@ -4077,46 +4079,16 @@ unsigned long __meminit __absent_pages_in_range(int nid,
4077 unsigned long range_start_pfn, 4079 unsigned long range_start_pfn,
4078 unsigned long range_end_pfn) 4080 unsigned long range_end_pfn)
4079{ 4081{
4080 int i = 0; 4082 unsigned long nr_absent = range_end_pfn - range_start_pfn;
4081 unsigned long prev_end_pfn = 0, hole_pages = 0; 4083 unsigned long start_pfn, end_pfn;
4082 unsigned long start_pfn; 4084 int i;
4083
4084 /* Find the end_pfn of the first active range of pfns in the node */
4085 i = first_active_region_index_in_nid(nid);
4086 if (i == -1)
4087 return 0;
4088
4089 prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
4090
4091 /* Account for ranges before physical memory on this node */
4092 if (early_node_map[i].start_pfn > range_start_pfn)
4093 hole_pages = prev_end_pfn - range_start_pfn;
4094
4095 /* Find all holes for the zone within the node */
4096 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
4097
4098 /* No need to continue if prev_end_pfn is outside the zone */
4099 if (prev_end_pfn >= range_end_pfn)
4100 break;
4101
4102 /* Make sure the end of the zone is not within the hole */
4103 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
4104 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
4105 4085
4106 /* Update the hole size cound and move on */ 4086 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4107 if (start_pfn > range_start_pfn) { 4087 start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
4108 BUG_ON(prev_end_pfn > start_pfn); 4088 end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
4109 hole_pages += start_pfn - prev_end_pfn; 4089 nr_absent -= end_pfn - start_pfn;
4110 }
4111 prev_end_pfn = early_node_map[i].end_pfn;
4112 } 4090 }
4113 4091 return nr_absent;
4114 /* Account for ranges past physical memory on this node */
4115 if (range_end_pfn > prev_end_pfn)
4116 hole_pages += range_end_pfn -
4117 max(range_start_pfn, prev_end_pfn);
4118
4119 return hole_pages;
4120} 4092}
4121 4093
4122/** 4094/**
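The rewritten hole accounting starts from the full span and subtracts every present range; a worked example with hypothetical numbers:

        /* range [0, 1000) with memory at [0, 600) and [700, 1000):
         *   nr_absent = 1000
         *   nr_absent -= 600 - 0     ->  400
         *   nr_absent -= 1000 - 700  ->  100 pages of holes, i.e. [600, 700) */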
@@ -4137,14 +4109,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4137 unsigned long zone_type, 4109 unsigned long zone_type,
4138 unsigned long *ignored) 4110 unsigned long *ignored)
4139{ 4111{
4112 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4113 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4140 unsigned long node_start_pfn, node_end_pfn; 4114 unsigned long node_start_pfn, node_end_pfn;
4141 unsigned long zone_start_pfn, zone_end_pfn; 4115 unsigned long zone_start_pfn, zone_end_pfn;
4142 4116
4143 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 4117 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4144 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 4118 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4145 node_start_pfn); 4119 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4146 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
4147 node_end_pfn);
4148 4120
4149 adjust_zone_range_for_zone_movable(nid, zone_type, 4121 adjust_zone_range_for_zone_movable(nid, zone_type,
4150 node_start_pfn, node_end_pfn, 4122 node_start_pfn, node_end_pfn,
@@ -4152,7 +4124,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4152 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 4124 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
4153} 4125}
4154 4126
4155#else 4127#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4156static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4128static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4157 unsigned long zone_type, 4129 unsigned long zone_type,
4158 unsigned long *zones_size) 4130 unsigned long *zones_size)
@@ -4170,7 +4142,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4170 return zholes_size[zone_type]; 4142 return zholes_size[zone_type];
4171} 4143}
4172 4144
4173#endif 4145#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4174 4146
4175static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4147static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4176 unsigned long *zones_size, unsigned long *zholes_size) 4148 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4290,7 +4262,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4290 for (j = 0; j < MAX_NR_ZONES; j++) { 4262 for (j = 0; j < MAX_NR_ZONES; j++) {
4291 struct zone *zone = pgdat->node_zones + j; 4263 struct zone *zone = pgdat->node_zones + j;
4292 unsigned long size, realsize, memmap_pages; 4264 unsigned long size, realsize, memmap_pages;
4293 enum lru_list l; 4265 enum lru_list lru;
4294 4266
4295 size = zone_spanned_pages_in_node(nid, j, zones_size); 4267 size = zone_spanned_pages_in_node(nid, j, zones_size);
4296 realsize = size - zone_absent_pages_in_node(nid, j, 4268 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4340,8 +4312,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4340 zone->zone_pgdat = pgdat; 4312 zone->zone_pgdat = pgdat;
4341 4313
4342 zone_pcp_init(zone); 4314 zone_pcp_init(zone);
4343 for_each_lru(l) 4315 for_each_lru(lru)
4344 INIT_LIST_HEAD(&zone->lru[l].list); 4316 INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
4345 zone->reclaim_stat.recent_rotated[0] = 0; 4317 zone->reclaim_stat.recent_rotated[0] = 0;
4346 zone->reclaim_stat.recent_rotated[1] = 0; 4318 zone->reclaim_stat.recent_rotated[1] = 0;
4347 zone->reclaim_stat.recent_scanned[0] = 0; 4319 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4393,10 +4365,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4393 */ 4365 */
4394 if (pgdat == NODE_DATA(0)) { 4366 if (pgdat == NODE_DATA(0)) {
4395 mem_map = NODE_DATA(0)->node_mem_map; 4367 mem_map = NODE_DATA(0)->node_mem_map;
4396#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 4368#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4397 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 4369 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
4398 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); 4370 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
4399#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 4371#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4400 } 4372 }
4401#endif 4373#endif
4402#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 4374#endif /* CONFIG_FLAT_NODE_MEM_MAP */
@@ -4421,7 +4393,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4421 free_area_init_core(pgdat, zones_size, zholes_size); 4393 free_area_init_core(pgdat, zones_size, zholes_size);
4422} 4394}
4423 4395
4424#ifdef CONFIG_ARCH_POPULATES_NODE_MAP 4396#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4425 4397
4426#if MAX_NUMNODES > 1 4398#if MAX_NUMNODES > 1
4427/* 4399/*
@@ -4443,170 +4415,6 @@ static inline void setup_nr_node_ids(void)
4443#endif 4415#endif
4444 4416
4445/** 4417/**
4446 * add_active_range - Register a range of PFNs backed by physical memory
4447 * @nid: The node ID the range resides on
4448 * @start_pfn: The start PFN of the available physical memory
4449 * @end_pfn: The end PFN of the available physical memory
4450 *
4451 * These ranges are stored in an early_node_map[] and later used by
4452 * free_area_init_nodes() to calculate zone sizes and holes. If the
4453 * range spans a memory hole, it is up to the architecture to ensure
4454 * the memory is not freed by the bootmem allocator. If possible
4455 * the range being registered will be merged with existing ranges.
4456 */
4457void __init add_active_range(unsigned int nid, unsigned long start_pfn,
4458 unsigned long end_pfn)
4459{
4460 int i;
4461
4462 mminit_dprintk(MMINIT_TRACE, "memory_register",
4463 "Entering add_active_range(%d, %#lx, %#lx) "
4464 "%d entries of %d used\n",
4465 nid, start_pfn, end_pfn,
4466 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
4467
4468 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
4469
4470 /* Merge with existing active regions if possible */
4471 for (i = 0; i < nr_nodemap_entries; i++) {
4472 if (early_node_map[i].nid != nid)
4473 continue;
4474
4475 /* Skip if an existing region covers this new one */
4476 if (start_pfn >= early_node_map[i].start_pfn &&
4477 end_pfn <= early_node_map[i].end_pfn)
4478 return;
4479
4480 /* Merge forward if suitable */
4481 if (start_pfn <= early_node_map[i].end_pfn &&
4482 end_pfn > early_node_map[i].end_pfn) {
4483 early_node_map[i].end_pfn = end_pfn;
4484 return;
4485 }
4486
4487 /* Merge backward if suitable */
4488 if (start_pfn < early_node_map[i].start_pfn &&
4489 end_pfn >= early_node_map[i].start_pfn) {
4490 early_node_map[i].start_pfn = start_pfn;
4491 return;
4492 }
4493 }
4494
4495 /* Check that early_node_map is large enough */
4496 if (i >= MAX_ACTIVE_REGIONS) {
4497 printk(KERN_CRIT "More than %d memory regions, truncating\n",
4498 MAX_ACTIVE_REGIONS);
4499 return;
4500 }
4501
4502 early_node_map[i].nid = nid;
4503 early_node_map[i].start_pfn = start_pfn;
4504 early_node_map[i].end_pfn = end_pfn;
4505 nr_nodemap_entries = i + 1;
4506}
4507
4508/**
4509 * remove_active_range - Shrink an existing registered range of PFNs
4510 * @nid: The node id the range is on that should be shrunk
4511 * @start_pfn: The new PFN of the range
4512 * @end_pfn: The new PFN of the range
4513 *
4514 * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node.
4515 * The map is kept near the end physical page range that has already been
4516 * registered. This function allows an arch to shrink an existing registered
4517 * range.
4518 */
4519void __init remove_active_range(unsigned int nid, unsigned long start_pfn,
4520 unsigned long end_pfn)
4521{
4522 int i, j;
4523 int removed = 0;
4524
4525 printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n",
4526 nid, start_pfn, end_pfn);
4527
4528 /* Find the old active region end and shrink */
4529 for_each_active_range_index_in_nid(i, nid) {
4530 if (early_node_map[i].start_pfn >= start_pfn &&
4531 early_node_map[i].end_pfn <= end_pfn) {
4532 /* clear it */
4533 early_node_map[i].start_pfn = 0;
4534 early_node_map[i].end_pfn = 0;
4535 removed = 1;
4536 continue;
4537 }
4538 if (early_node_map[i].start_pfn < start_pfn &&
4539 early_node_map[i].end_pfn > start_pfn) {
4540 unsigned long temp_end_pfn = early_node_map[i].end_pfn;
4541 early_node_map[i].end_pfn = start_pfn;
4542 if (temp_end_pfn > end_pfn)
4543 add_active_range(nid, end_pfn, temp_end_pfn);
4544 continue;
4545 }
4546 if (early_node_map[i].start_pfn >= start_pfn &&
4547 early_node_map[i].end_pfn > end_pfn &&
4548 early_node_map[i].start_pfn < end_pfn) {
4549 early_node_map[i].start_pfn = end_pfn;
4550 continue;
4551 }
4552 }
4553
4554 if (!removed)
4555 return;
4556
4557 /* remove the blank ones */
4558 for (i = nr_nodemap_entries - 1; i > 0; i--) {
4559 if (early_node_map[i].nid != nid)
4560 continue;
4561 if (early_node_map[i].end_pfn)
4562 continue;
4563 /* we found it, get rid of it */
4564 for (j = i; j < nr_nodemap_entries - 1; j++)
4565 memcpy(&early_node_map[j], &early_node_map[j+1],
4566 sizeof(early_node_map[j]));
4567 j = nr_nodemap_entries - 1;
4568 memset(&early_node_map[j], 0, sizeof(early_node_map[j]));
4569 nr_nodemap_entries--;
4570 }
4571}
4572
4573/**
4574 * remove_all_active_ranges - Remove all currently registered regions
4575 *
4576 * During discovery, it may be found that a table like SRAT is invalid
4577 * and an alternative discovery method must be used. This function removes
4578 * all currently registered regions.
4579 */
4580void __init remove_all_active_ranges(void)
4581{
4582 memset(early_node_map, 0, sizeof(early_node_map));
4583 nr_nodemap_entries = 0;
4584}
4585
4586/* Compare two active node_active_regions */
4587static int __init cmp_node_active_region(const void *a, const void *b)
4588{
4589 struct node_active_region *arange = (struct node_active_region *)a;
4590 struct node_active_region *brange = (struct node_active_region *)b;
4591
4592 /* Done this way to avoid overflows */
4593 if (arange->start_pfn > brange->start_pfn)
4594 return 1;
4595 if (arange->start_pfn < brange->start_pfn)
4596 return -1;
4597
4598 return 0;
4599}
4600
4601/* sort the node_map by start_pfn */
4602void __init sort_node_map(void)
4603{
4604 sort(early_node_map, (size_t)nr_nodemap_entries,
4605 sizeof(struct node_active_region),
4606 cmp_node_active_region, NULL);
4607}
4608
4609/**
4610 * node_map_pfn_alignment - determine the maximum internode alignment 4418 * node_map_pfn_alignment - determine the maximum internode alignment
4611 * 4419 *
4612 * This function should be called after node map is populated and sorted. 4420 * This function should be called after node map is populated and sorted.
@@ -4628,15 +4436,11 @@ void __init sort_node_map(void)
4628unsigned long __init node_map_pfn_alignment(void) 4436unsigned long __init node_map_pfn_alignment(void)
4629{ 4437{
4630 unsigned long accl_mask = 0, last_end = 0; 4438 unsigned long accl_mask = 0, last_end = 0;
4439 unsigned long start, end, mask;
4631 int last_nid = -1; 4440 int last_nid = -1;
4632 int i; 4441 int i, nid;
4633
4634 for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
4635 int nid = early_node_map[i].nid;
4636 unsigned long start = early_node_map[i].start_pfn;
4637 unsigned long end = early_node_map[i].end_pfn;
4638 unsigned long mask;
4639 4442
4443 for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
4640 if (!start || last_nid < 0 || last_nid == nid) { 4444 if (!start || last_nid < 0 || last_nid == nid) {
4641 last_nid = nid; 4445 last_nid = nid;
4642 last_end = end; 4446 last_end = end;
@@ -4663,12 +4467,12 @@ unsigned long __init node_map_pfn_alignment(void)
4663/* Find the lowest pfn for a node */ 4467/* Find the lowest pfn for a node */
4664static unsigned long __init find_min_pfn_for_node(int nid) 4468static unsigned long __init find_min_pfn_for_node(int nid)
4665{ 4469{
4666 int i;
4667 unsigned long min_pfn = ULONG_MAX; 4470 unsigned long min_pfn = ULONG_MAX;
4471 unsigned long start_pfn;
4472 int i;
4668 4473
4669 /* Assuming a sorted map, the first range found has the starting pfn */ 4474 for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
4670 for_each_active_range_index_in_nid(i, nid) 4475 min_pfn = min(min_pfn, start_pfn);
4671 min_pfn = min(min_pfn, early_node_map[i].start_pfn);
4672 4476
4673 if (min_pfn == ULONG_MAX) { 4477 if (min_pfn == ULONG_MAX) {
4674 printk(KERN_WARNING 4478 printk(KERN_WARNING
@@ -4697,15 +4501,16 @@ unsigned long __init find_min_pfn_with_active_regions(void)
4697 */ 4501 */
4698static unsigned long __init early_calculate_totalpages(void) 4502static unsigned long __init early_calculate_totalpages(void)
4699{ 4503{
4700 int i;
4701 unsigned long totalpages = 0; 4504 unsigned long totalpages = 0;
4505 unsigned long start_pfn, end_pfn;
4506 int i, nid;
4507
4508 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
4509 unsigned long pages = end_pfn - start_pfn;
4702 4510
4703 for (i = 0; i < nr_nodemap_entries; i++) {
4704 unsigned long pages = early_node_map[i].end_pfn -
4705 early_node_map[i].start_pfn;
4706 totalpages += pages; 4511 totalpages += pages;
4707 if (pages) 4512 if (pages)
4708 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); 4513 node_set_state(nid, N_HIGH_MEMORY);
4709 } 4514 }
4710 return totalpages; 4515 return totalpages;
4711} 4516}
@@ -4760,6 +4565,8 @@ restart:
4760 /* Spread kernelcore memory as evenly as possible throughout nodes */ 4565 /* Spread kernelcore memory as evenly as possible throughout nodes */
4761 kernelcore_node = required_kernelcore / usable_nodes; 4566 kernelcore_node = required_kernelcore / usable_nodes;
4762 for_each_node_state(nid, N_HIGH_MEMORY) { 4567 for_each_node_state(nid, N_HIGH_MEMORY) {
4568 unsigned long start_pfn, end_pfn;
4569
4763 /* 4570 /*
4764 * Recalculate kernelcore_node if the division per node 4571 * Recalculate kernelcore_node if the division per node
4765 * now exceeds what is necessary to satisfy the requested 4572 * now exceeds what is necessary to satisfy the requested
@@ -4776,13 +4583,10 @@ restart:
4776 kernelcore_remaining = kernelcore_node; 4583 kernelcore_remaining = kernelcore_node;
4777 4584
4778 /* Go through each range of PFNs within this node */ 4585 /* Go through each range of PFNs within this node */
4779 for_each_active_range_index_in_nid(i, nid) { 4586 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
4780 unsigned long start_pfn, end_pfn;
4781 unsigned long size_pages; 4587 unsigned long size_pages;
4782 4588
4783 start_pfn = max(early_node_map[i].start_pfn, 4589 start_pfn = max(start_pfn, zone_movable_pfn[nid]);
4784 zone_movable_pfn[nid]);
4785 end_pfn = early_node_map[i].end_pfn;
4786 if (start_pfn >= end_pfn) 4590 if (start_pfn >= end_pfn)
4787 continue; 4591 continue;
4788 4592
@@ -4863,8 +4667,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
4863 4667
4864 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { 4668 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
4865 struct zone *zone = &pgdat->node_zones[zone_type]; 4669 struct zone *zone = &pgdat->node_zones[zone_type];
4866 if (zone->present_pages) 4670 if (zone->present_pages) {
4867 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); 4671 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
4672 break;
4673 }
4868 } 4674 }
4869#endif 4675#endif
4870} 4676}
@@ -4884,11 +4690,8 @@ static void check_for_regular_memory(pg_data_t *pgdat)
4884 */ 4690 */
4885void __init free_area_init_nodes(unsigned long *max_zone_pfn) 4691void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4886{ 4692{
4887 unsigned long nid; 4693 unsigned long start_pfn, end_pfn;
4888 int i; 4694 int i, nid;
4889
4890 /* Sort early_node_map as initialisation assumes it is sorted */
4891 sort_node_map();
4892 4695
4893 /* Record where the zone boundaries are */ 4696 /* Record where the zone boundaries are */
4894 memset(arch_zone_lowest_possible_pfn, 0, 4697 memset(arch_zone_lowest_possible_pfn, 0,
@@ -4935,11 +4738,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4935 } 4738 }
4936 4739
4937 /* Print out the early_node_map[] */ 4740 /* Print out the early_node_map[] */
4938 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 4741 printk("Early memory PFN ranges\n");
4939 for (i = 0; i < nr_nodemap_entries; i++) 4742 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4940 printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, 4743 printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
4941 early_node_map[i].start_pfn,
4942 early_node_map[i].end_pfn);
4943 4744
4944 /* Initialise every node */ 4745 /* Initialise every node */
4945 mminit_verify_pageflags_layout(); 4746 mminit_verify_pageflags_layout();
@@ -4992,7 +4793,7 @@ static int __init cmdline_parse_movablecore(char *p)
4992early_param("kernelcore", cmdline_parse_kernelcore); 4793early_param("kernelcore", cmdline_parse_kernelcore);
4993early_param("movablecore", cmdline_parse_movablecore); 4794early_param("movablecore", cmdline_parse_movablecore);
4994 4795
4995#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 4796#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4996 4797
4997/** 4798/**
4998 * set_dma_reserve - set the specified number of pages reserved in the first zone 4799 * set_dma_reserve - set the specified number of pages reserved in the first zone
@@ -5076,8 +4877,19 @@ static void calculate_totalreserve_pages(void)
5076 if (max > zone->present_pages) 4877 if (max > zone->present_pages)
5077 max = zone->present_pages; 4878 max = zone->present_pages;
5078 reserve_pages += max; 4879 reserve_pages += max;
4880 /*
4881 * Lowmem reserves are not available to
4882 * GFP_HIGHUSER page cache allocations and
4883 * kswapd tries to balance zones to their high
4884 * watermark. As a result, neither should be
4885 * regarded as dirtyable memory, to prevent a
4886 * situation where reclaim has to clean pages
4887 * in order to balance the zones.
4888 */
4889 zone->dirty_balance_reserve = max;
5079 } 4890 }
5080 } 4891 }
4892 dirty_balance_reserve = reserve_pages;
5081 totalreserve_pages = reserve_pages; 4893 totalreserve_pages = reserve_pages;
5082} 4894}
5083 4895
@@ -5601,7 +5413,25 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5601 5413
5602bool is_pageblock_removable_nolock(struct page *page) 5414bool is_pageblock_removable_nolock(struct page *page)
5603{ 5415{
5604 struct zone *zone = page_zone(page); 5416 struct zone *zone;
5417 unsigned long pfn;
5418
5419 /*
5420 * We have to be careful here because we are iterating over memory
5421 * sections which are not zone aware so we might end up outside of
5422 * the zone but still within the section.
5423 * We have to take care about the node as well. If the node is offline
5424 * its NODE_DATA will be NULL - see page_zone.
5425 */
5426 if (!node_online(page_to_nid(page)))
5427 return false;
5428
5429 zone = page_zone(page);
5430 pfn = page_to_pfn(page);
5431 if (zone->zone_start_pfn > pfn ||
5432 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5433 return false;
5434
5605 return __count_immobile_pages(zone, page, 0); 5435 return __count_immobile_pages(zone, page, 0);
5606} 5436}
5607 5437