author		Ingo Molnar <mingo@elte.hu>	2011-02-14 05:55:18 -0500
committer	Ingo Molnar <mingo@elte.hu>	2011-02-14 05:55:18 -0500
commit		d2137d5af4259f50c19addb8246a186c9ffac325 (patch)
tree		2f7e309f9cf8ef2f2698532c226edda38021fe69 /mm/page_alloc.c
parent		f005fe12b90c5b9fe180a09209a893e09affa8aa (diff)
parent		795abaf1e4e188c4171e3cd3dbb11a9fcacaf505 (diff)
Merge branch 'linus' into x86/bootmem
Conflicts:
	arch/x86/mm/numa_64.c

Merge reason: fix the conflict, update to latest -rc and pick up this
dependent fix from Yinghai:

  e6d2e2b2b1e1: memblock: don't adjust size in memblock_find_base()

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	216
1 file changed, 138 insertions(+), 78 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 19413bfdef92..887ce3bd823d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
  * only be modified with pm_mutex held, unless the suspend/hibernate code is
  * guaranteed not to run in parallel with that modification).
  */
-void set_gfp_allowed_mask(gfp_t mask)
+
+static gfp_t saved_gfp_mask;
+
+void pm_restore_gfp_mask(void)
 {
 	WARN_ON(!mutex_is_locked(&pm_mutex));
-	gfp_allowed_mask = mask;
+	if (saved_gfp_mask) {
+		gfp_allowed_mask = saved_gfp_mask;
+		saved_gfp_mask = 0;
+	}
 }
 
-gfp_t clear_gfp_allowed_mask(gfp_t mask)
+void pm_restrict_gfp_mask(void)
 {
-	gfp_t ret = gfp_allowed_mask;
-
 	WARN_ON(!mutex_is_locked(&pm_mutex));
-	gfp_allowed_mask &= ~mask;
-	return ret;
+	WARN_ON(saved_gfp_mask);
+	saved_gfp_mask = gfp_allowed_mask;
+	gfp_allowed_mask &= ~GFP_IOFS;
 }
 #endif /* CONFIG_PM_SLEEP */
 
@@ -352,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
 	}
 }
 
+/* update __split_huge_page_refcount if you change this function */
 static int destroy_compound_page(struct page *page, unsigned long order)
 {
 	int i;
@@ -421,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
  *
  * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
  */
-static inline struct page *
-__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
-{
-	unsigned long buddy_idx = page_idx ^ (1 << order);
-
-	return page + (buddy_idx - page_idx);
-}
-
 static inline unsigned long
-__find_combined_index(unsigned long page_idx, unsigned int order)
+__find_buddy_index(unsigned long page_idx, unsigned int order)
 {
-	return (page_idx & ~(1 << order));
+	return page_idx ^ (1 << order);
 }
 
 /*
@@ -443,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (c) a page and its buddy have the same order &&
  * (d) a page and its buddy are in the same zone.
  *
- * For recording whether a page is in the buddy system, we use PG_buddy.
- * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
+ * For recording whether a page is in the buddy system, we set ->_mapcount -2.
+ * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
 *
 * For recording page's order, we use page_private(page).
 */
@@ -477,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * as necessary, plus some accounting needed to play nicely with other
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
- * free pages of length of (1 << order) and marked with PG_buddy. Page's
+ * free pages of length of (1 << order) and marked with _mapcount -2. Page's
  * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
@@ -494,6 +492,7 @@ static inline void __free_one_page(struct page *page,
 {
 	unsigned long page_idx;
 	unsigned long combined_idx;
+	unsigned long uninitialized_var(buddy_idx);
 	struct page *buddy;
 
 	if (unlikely(PageCompound(page)))
@@ -508,7 +507,8 @@ static inline void __free_one_page(struct page *page,
 	VM_BUG_ON(bad_range(zone, page));
 
 	while (order < MAX_ORDER-1) {
-		buddy = __page_find_buddy(page, page_idx, order);
+		buddy_idx = __find_buddy_index(page_idx, order);
+		buddy = page + (buddy_idx - page_idx);
 		if (!page_is_buddy(page, buddy, order))
 			break;
 
@@ -516,7 +516,7 @@ static inline void __free_one_page(struct page *page,
 		list_del(&buddy->lru);
 		zone->free_area[order].nr_free--;
 		rmv_page_order(buddy);
-		combined_idx = __find_combined_index(page_idx, order);
+		combined_idx = buddy_idx & page_idx;
 		page = page + (combined_idx - page_idx);
 		page_idx = combined_idx;
 		order++;
@@ -533,9 +533,10 @@ static inline void __free_one_page(struct page *page,
 	 */
 	if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
 		struct page *higher_page, *higher_buddy;
-		combined_idx = __find_combined_index(page_idx, order);
-		higher_page = page + combined_idx - page_idx;
-		higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1);
+		combined_idx = buddy_idx & page_idx;
+		higher_page = page + (combined_idx - page_idx);
+		buddy_idx = __find_buddy_index(combined_idx, order + 1);
+		higher_buddy = page + (buddy_idx - combined_idx);
 		if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
 			list_add_tail(&page->lru,
 				&zone->free_area[order].free_list[migratetype]);
@@ -646,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 	trace_mm_page_free_direct(page, order);
 	kmemcheck_free_shadow(page, order);
 
-	for (i = 0; i < (1 << order); i++) {
-		struct page *pg = page + i;
-
-		if (PageAnon(pg))
-			pg->mapping = NULL;
-		bad += free_pages_check(pg);
-	}
+	if (PageAnon(page))
+		page->mapping = NULL;
+	for (i = 0; i < (1 << order); i++)
+		bad += free_pages_check(page + i);
 	if (bad)
 		return false;
 
@@ -1090,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
 		pset = per_cpu_ptr(zone->pageset, cpu);
 
 		pcp = &pset->pcp;
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
+		if (pcp->count) {
+			free_pcppages_bulk(zone, pcp->count, pcp);
+			pcp->count = 0;
+		}
 		local_irq_restore(flags);
 	}
 }
@@ -1455,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return 1 if free pages are above 'mark'. This takes into account the order
+ * Return true if free pages are above 'mark'. This takes into account the order
  * of the allocation.
 */
-int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags, long free_pages)
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
+	free_pages -= (1 << order) + 1;
 	if (alloc_flags & ALLOC_HIGH)
 		min -= min / 2;
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
-		return 0;
+		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
 		free_pages -= z->free_area[o].nr_free << o;
@@ -1481,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		min >>= 1;
 
 		if (free_pages <= min)
-			return 0;
+			return false;
 	}
-	return 1;
+	return true;
+}
+
+bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					zone_page_state(z, NR_FREE_PAGES));
+}
+
+bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
+		      int classzone_idx, int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+
+	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+								free_pages);
 }
 
 #ifdef CONFIG_NUMA
@@ -1788,15 +1807,18 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress)
+	int migratetype, unsigned long *did_some_progress,
+	bool sync_migration)
 {
 	struct page *page;
 
 	if (!order || compaction_deferred(preferred_zone))
 		return NULL;
 
+	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
-								nodemask);
+						nodemask, sync_migration);
+	current->flags &= ~PF_MEMALLOC;
 	if (*did_some_progress != COMPACT_SKIPPED) {
 
 		/* Page migration frees to the PCP lists but we want merging */
@@ -1832,7 +1854,8 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress)
+	int migratetype, unsigned long *did_some_progress,
+	bool sync_migration)
 {
 	return NULL;
 }
@@ -1847,23 +1870,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 {
 	struct page *page = NULL;
 	struct reclaim_state reclaim_state;
-	struct task_struct *p = current;
 	bool drained = false;
 
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
 	cpuset_memory_pressure_bump();
-	p->flags |= PF_MEMALLOC;
+	current->flags |= PF_MEMALLOC;
 	lockdep_set_current_reclaim_state(gfp_mask);
 	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
+	current->reclaim_state = &reclaim_state;
 
 	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
 
-	p->reclaim_state = NULL;
+	current->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
-	p->flags &= ~PF_MEMALLOC;
+	current->flags &= ~PF_MEMALLOC;
 
 	cond_resched();
 
@@ -1915,19 +1937,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 
 static inline
 void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
-						enum zone_type high_zoneidx)
+						enum zone_type high_zoneidx,
+						enum zone_type classzone_idx)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order);
+		wakeup_kswapd(zone, order, classzone_idx);
 }
 
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-	struct task_struct *p = current;
 	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 
@@ -1943,18 +1965,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
 	if (!wait) {
-		alloc_flags |= ALLOC_HARDER;
+		/*
+		 * Not worth trying to allocate harder for
+		 * __GFP_NOMEMALLOC even if it can't schedule.
+		 */
+		if (!(gfp_mask & __GFP_NOMEMALLOC))
+			alloc_flags |= ALLOC_HARDER;
 		/*
 		 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 		 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
-	} else if (unlikely(rt_task(p)) && !in_interrupt())
+	} else if (unlikely(rt_task(current)) && !in_interrupt())
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
 		if (!in_interrupt() &&
-		    ((p->flags & PF_MEMALLOC) ||
+		    ((current->flags & PF_MEMALLOC) ||
 		     unlikely(test_thread_flag(TIF_MEMDIE))))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
@@ -1973,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
-	struct task_struct *p = current;
+	bool sync_migration = false;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -1998,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 		goto nopage;
 
 restart:
-	wake_all_kswapd(order, zonelist, high_zoneidx);
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapd(order, zonelist, high_zoneidx,
+						zone_idx(preferred_zone));
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2007,6 +2036,14 @@ restart:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
+	/*
+	 * Find the true preferred zone if the allocation is unconstrained by
+	 * cpusets.
+	 */
+	if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
+		first_zones_zonelist(zonelist, high_zoneidx, NULL,
+					&preferred_zone);
+
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2029,21 +2066,26 @@ rebalance:
 		goto nopage;
 
 	/* Avoid recursion of direct reclaim */
-	if (p->flags & PF_MEMALLOC)
+	if (current->flags & PF_MEMALLOC)
 		goto nopage;
 
 	/* Avoid allocations with no watermarks from looping endlessly */
 	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
 		goto nopage;
 
-	/* Try direct compaction */
+	/*
+	 * Try direct compaction. The first pass is asynchronous. Subsequent
+	 * attempts after direct reclaim are synchronous
+	 */
 	page = __alloc_pages_direct_compact(gfp_mask, order,
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress);
+					migratetype, &did_some_progress,
+					sync_migration);
 	if (page)
 		goto got_pg;
+	sync_migration = true;
 
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2097,13 +2139,27 @@ rebalance:
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
+	} else {
+		/*
+		 * High-order allocations do not necessarily loop after
+		 * direct reclaim and reclaim/compaction depends on compaction
+		 * being called after reclaim so call directly if necessary
+		 */
+		page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress,
+					sync_migration);
+		if (page)
+			goto got_pg;
 	}
 
 nopage:
 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 		printk(KERN_WARNING "%s: page allocation failure."
 			" order:%d, mode:0x%x\n",
-			p->comm, order, gfp_mask);
+			current->comm, order, gfp_mask);
 		dump_stack();
 		show_mem();
 	}
@@ -2146,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
 	get_mems_allowed();
 	/* The preferred zone is used for statistics later */
-	first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
+	first_zones_zonelist(zonelist, high_zoneidx,
+				nodemask ? : &cpuset_current_mems_allowed,
+				&preferred_zone);
 	if (!preferred_zone) {
 		put_mems_allowed();
 		return NULL;
@@ -2437,7 +2495,7 @@ void show_free_areas(void)
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_nr_free_pages(zone)),
+			K(zone_page_state(zone, NR_FREE_PAGES)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
@@ -2580,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s)
 
 static __init int setup_numa_zonelist_order(char *s)
 {
-	if (s)
-		return __parse_numa_zonelist_order(s);
-	return 0;
+	int ret;
+
+	if (!s)
+		return 0;
+
+	ret = __parse_numa_zonelist_order(s);
+	if (ret == 0)
+		strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
+
+	return ret;
 }
 early_param("numa_zonelist_order", setup_numa_zonelist_order);
 
@@ -3008,14 +3073,6 @@ static __init_refok int __build_all_zonelists(void *data)
 		build_zonelist_cache(pgdat);
 	}
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-	/* Setup real pagesets for the new zone */
-	if (data) {
-		struct zone *zone = data;
-		setup_zone_pageset(zone);
-	}
-#endif
-
 	/*
 	 * Initialize the boot_pagesets that are going to be used
 	 * for bootstrapping processors. The real pagesets for
@@ -3064,7 +3121,11 @@ void build_all_zonelists(void *data)
 	} else {
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
-		stop_machine(__build_all_zonelists, data, NULL);
+#ifdef CONFIG_MEMORY_HOTPLUG
+		if (data)
+			setup_zone_pageset((struct zone *)data);
+#endif
+		stop_machine(__build_all_zonelists, NULL, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
@@ -4045,7 +4106,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
 	zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
 }
 #else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
 				struct zone *zone, unsigned long zonesize) {}
 #endif /* CONFIG_SPARSEMEM */
 
@@ -5548,7 +5609,6 @@ static struct trace_print_flags pageflag_names[] = {
 	{1UL << PG_swapcache,		"swapcache"	},
 	{1UL << PG_mappedtodisk,	"mappedtodisk"	},
 	{1UL << PG_reclaim,		"reclaim"	},
-	{1UL << PG_buddy,		"buddy"		},
 	{1UL << PG_swapbacked,		"swapbacked"	},
 	{1UL << PG_unevictable,		"unevictable"	},
 #ifdef CONFIG_MMU
@@ -5596,7 +5656,7 @@ void dump_page(struct page *page)
 {
 	printk(KERN_ALERT
 	"page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
-		page, page_count(page), page_mapcount(page),
+		page, atomic_read(&page->_count), page_mapcount(page),
		page->mapping, page->index);
 	dump_page_flags(page->flags);
 }
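
Aside on the buddy-index hunks above: the patch drops __page_find_buddy()/__find_combined_index() in favour of __find_buddy_index(), which returns page_idx ^ (1 << order), and computes the merged block's index as buddy_idx & page_idx (the same value the old page_idx & ~(1 << order) produced). A minimal standalone C sketch of that arithmetic, using an illustrative helper name rather than the kernel's, is:

#include <assert.h>
#include <stdio.h>

/* Same arithmetic as __find_buddy_index(): flip the bit that selects
 * which half of the next-higher-order block this block occupies. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;	/* an order-2 block (4 pages) starting at index 12 */
	unsigned int order = 2;

	unsigned long buddy_idx = find_buddy_index(page_idx, order);	/* 12 ^ 4 = 8 */
	unsigned long combined_idx = buddy_idx & page_idx;		/* 8 & 12 = 8 */

	/* The old helper gave the same result: page_idx & ~(1 << order) = 8. */
	assert(combined_idx == (page_idx & ~(1UL << order)));

	printf("buddy_idx=%lu combined_idx=%lu (start of the merged order-%u block)\n",
	       buddy_idx, combined_idx, order + 1);
	return 0;
}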