path: root/mm/page_alloc.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 629
1 file changed, 433 insertions(+), 196 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..4e8985acdab8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/compiler.h> 25#include <linux/compiler.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
@@ -29,6 +30,7 @@
29#include <linux/pagevec.h> 30#include <linux/pagevec.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/ratelimit.h>
32#include <linux/oom.h> 34#include <linux/oom.h>
33#include <linux/notifier.h> 35#include <linux/notifier.h>
34#include <linux/topology.h> 36#include <linux/topology.h>
@@ -38,6 +40,7 @@
38#include <linux/memory_hotplug.h> 40#include <linux/memory_hotplug.h>
39#include <linux/nodemask.h> 41#include <linux/nodemask.h>
40#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
43#include <linux/vmstat.h>
41#include <linux/mempolicy.h> 44#include <linux/mempolicy.h>
42#include <linux/stop_machine.h> 45#include <linux/stop_machine.h>
43#include <linux/sort.h> 46#include <linux/sort.h>
@@ -52,6 +55,8 @@
52#include <linux/compaction.h> 55#include <linux/compaction.h>
53#include <trace/events/kmem.h> 56#include <trace/events/kmem.h>
54#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h>
59#include <linux/prefetch.h>
55 60
56#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
57#include <asm/div64.h> 62#include <asm/div64.h>
@@ -103,19 +108,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
103 * only be modified with pm_mutex held, unless the suspend/hibernate code is 108 * only be modified with pm_mutex held, unless the suspend/hibernate code is
104 * guaranteed not to run in parallel with that modification). 109 * guaranteed not to run in parallel with that modification).
105 */ 110 */
106void set_gfp_allowed_mask(gfp_t mask) 111
112static gfp_t saved_gfp_mask;
113
114void pm_restore_gfp_mask(void)
107{ 115{
108 WARN_ON(!mutex_is_locked(&pm_mutex)); 116 WARN_ON(!mutex_is_locked(&pm_mutex));
109 gfp_allowed_mask = mask; 117 if (saved_gfp_mask) {
118 gfp_allowed_mask = saved_gfp_mask;
119 saved_gfp_mask = 0;
120 }
110} 121}
111 122
112gfp_t clear_gfp_allowed_mask(gfp_t mask) 123void pm_restrict_gfp_mask(void)
113{ 124{
114 gfp_t ret = gfp_allowed_mask;
115
116 WARN_ON(!mutex_is_locked(&pm_mutex)); 125 WARN_ON(!mutex_is_locked(&pm_mutex));
117 gfp_allowed_mask &= ~mask; 126 WARN_ON(saved_gfp_mask);
118 return ret; 127 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS;
119} 129}
120#endif /* CONFIG_PM_SLEEP */ 130#endif /* CONFIG_PM_SLEEP */
121 131
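The hunk above replaces set_gfp_allowed_mask()/clear_gfp_allowed_mask() with a save/restore pair: pm_restrict_gfp_mask() stashes the current gfp_allowed_mask before clearing GFP_IOFS, and pm_restore_gfp_mask() puts it back only if something was saved. A minimal userspace sketch of that pattern follows; the mask values and the assert() standing in for WARN_ON() are illustrative, not the kernel definitions.

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-ins; the real values live in the kernel's gfp.h. */
#define GFP_IO   0x40u
#define GFP_FS   0x80u
#define GFP_IOFS (GFP_IO | GFP_FS)

static unsigned int gfp_allowed_mask = 0xffu;  /* everything allowed */
static unsigned int saved_gfp_mask;            /* 0 means "nothing saved" */

static void pm_restrict_gfp_mask_sketch(void)
{
	assert(saved_gfp_mask == 0);        /* mirrors WARN_ON(saved_gfp_mask) */
	saved_gfp_mask = gfp_allowed_mask;  /* remember the full mask */
	gfp_allowed_mask &= ~GFP_IOFS;      /* forbid I/O and FS reclaim */
}

static void pm_restore_gfp_mask_sketch(void)
{
	if (saved_gfp_mask) {               /* only restore if something was saved */
		gfp_allowed_mask = saved_gfp_mask;
		saved_gfp_mask = 0;
	}
}

int main(void)
{
	pm_restrict_gfp_mask_sketch();
	printf("restricted: %#x\n", gfp_allowed_mask);  /* 0x3f */
	pm_restore_gfp_mask_sketch();
	printf("restored:   %#x\n", gfp_allowed_mask);  /* 0xff */
	return 0;
}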
@@ -280,7 +290,7 @@ static void bad_page(struct page *page)
280 290
281 /* Don't complain about poisoned pages */ 291 /* Don't complain about poisoned pages */
282 if (PageHWPoison(page)) { 292 if (PageHWPoison(page)) {
283 __ClearPageBuddy(page); 293 reset_page_mapcount(page); /* remove PageBuddy */
284 return; 294 return;
285 } 295 }
286 296
@@ -311,7 +321,7 @@ static void bad_page(struct page *page)
311 dump_stack(); 321 dump_stack();
312out: 322out:
313 /* Leave bad fields for debug, except PageBuddy could make trouble */ 323 /* Leave bad fields for debug, except PageBuddy could make trouble */
314 __ClearPageBuddy(page); 324 reset_page_mapcount(page); /* remove PageBuddy */
315 add_taint(TAINT_BAD_PAGE); 325 add_taint(TAINT_BAD_PAGE);
316} 326}
317 327
@@ -351,6 +361,7 @@ void prep_compound_page(struct page *page, unsigned long order)
351 } 361 }
352} 362}
353 363
364/* update __split_huge_page_refcount if you change this function */
354static int destroy_compound_page(struct page *page, unsigned long order) 365static int destroy_compound_page(struct page *page, unsigned long order)
355{ 366{
356 int i; 367 int i;
@@ -420,18 +431,10 @@ static inline void rmv_page_order(struct page *page)
420 * 431 *
421 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 432 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
422 */ 433 */
423static inline struct page *
424__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
425{
426 unsigned long buddy_idx = page_idx ^ (1 << order);
427
428 return page + (buddy_idx - page_idx);
429}
430
431static inline unsigned long 434static inline unsigned long
432__find_combined_index(unsigned long page_idx, unsigned int order) 435__find_buddy_index(unsigned long page_idx, unsigned int order)
433{ 436{
434 return (page_idx & ~(1 << order)); 437 return page_idx ^ (1 << order);
435} 438}
436 439
437/* 440/*
@@ -442,8 +445,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
442 * (c) a page and its buddy have the same order && 445 * (c) a page and its buddy have the same order &&
443 * (d) a page and its buddy are in the same zone. 446 * (d) a page and its buddy are in the same zone.
444 * 447 *
445 * For recording whether a page is in the buddy system, we use PG_buddy. 448 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
446 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 449 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
447 * 450 *
448 * For recording page's order, we use page_private(page). 451 * For recording page's order, we use page_private(page).
449 */ 452 */
@@ -476,7 +479,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
476 * as necessary, plus some accounting needed to play nicely with other 479 * as necessary, plus some accounting needed to play nicely with other
477 * parts of the VM system. 480 * parts of the VM system.
478 * At each level, we keep a list of pages, which are heads of continuous 481 * At each level, we keep a list of pages, which are heads of continuous
479 * free pages of length of (1 << order) and marked with PG_buddy. Page's 482 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
480 * order is recorded in page_private(page) field. 483 * order is recorded in page_private(page) field.
481 * So when we are allocating or freeing one, we can derive the state of the 484 * So when we are allocating or freeing one, we can derive the state of the
482 * other. That is, if we allocate a small block, and both were 485 * other. That is, if we allocate a small block, and both were
@@ -493,6 +496,7 @@ static inline void __free_one_page(struct page *page,
493{ 496{
494 unsigned long page_idx; 497 unsigned long page_idx;
495 unsigned long combined_idx; 498 unsigned long combined_idx;
499 unsigned long uninitialized_var(buddy_idx);
496 struct page *buddy; 500 struct page *buddy;
497 501
498 if (unlikely(PageCompound(page))) 502 if (unlikely(PageCompound(page)))
@@ -507,7 +511,8 @@ static inline void __free_one_page(struct page *page,
507 VM_BUG_ON(bad_range(zone, page)); 511 VM_BUG_ON(bad_range(zone, page));
508 512
509 while (order < MAX_ORDER-1) { 513 while (order < MAX_ORDER-1) {
510 buddy = __page_find_buddy(page, page_idx, order); 514 buddy_idx = __find_buddy_index(page_idx, order);
515 buddy = page + (buddy_idx - page_idx);
511 if (!page_is_buddy(page, buddy, order)) 516 if (!page_is_buddy(page, buddy, order))
512 break; 517 break;
513 518
@@ -515,7 +520,7 @@ static inline void __free_one_page(struct page *page,
515 list_del(&buddy->lru); 520 list_del(&buddy->lru);
516 zone->free_area[order].nr_free--; 521 zone->free_area[order].nr_free--;
517 rmv_page_order(buddy); 522 rmv_page_order(buddy);
518 combined_idx = __find_combined_index(page_idx, order); 523 combined_idx = buddy_idx & page_idx;
519 page = page + (combined_idx - page_idx); 524 page = page + (combined_idx - page_idx);
520 page_idx = combined_idx; 525 page_idx = combined_idx;
521 order++; 526 order++;
@@ -530,11 +535,12 @@ static inline void __free_one_page(struct page *page,
530 * so it's less likely to be used soon and more likely to be merged 535 * so it's less likely to be used soon and more likely to be merged
531 * as a higher order page 536 * as a higher order page
532 */ 537 */
533 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { 538 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
534 struct page *higher_page, *higher_buddy; 539 struct page *higher_page, *higher_buddy;
535 combined_idx = __find_combined_index(page_idx, order); 540 combined_idx = buddy_idx & page_idx;
536 higher_page = page + combined_idx - page_idx; 541 higher_page = page + (combined_idx - page_idx);
537 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 542 buddy_idx = __find_buddy_index(combined_idx, order + 1);
543 higher_buddy = page + (buddy_idx - combined_idx);
538 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 544 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
539 list_add_tail(&page->lru, 545 list_add_tail(&page->lru,
540 &zone->free_area[order].free_list[migratetype]); 546 &zone->free_area[order].free_list[migratetype]);
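The buddy-merging hunks above drop __page_find_buddy() in favour of open-coded index math: the buddy of the block at page_idx for a given order is page_idx ^ (1 << order), and the merged block starts at buddy_idx & page_idx (the same value the old __find_combined_index() computed as page_idx & ~(1 << order)). A small standalone sketch of just that arithmetic, with an arbitrary example index:

#include <stdio.h>

/* Buddy index of the block starting at page_idx, for a given order. */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

int main(void)
{
	unsigned long page_idx = 12;   /* block of 2^2 = 4 pages at index 12 */
	unsigned int order = 2;

	unsigned long buddy_idx = find_buddy_index(page_idx, order);  /* 8 */
	unsigned long combined_idx = buddy_idx & page_idx;            /* 8 */

	/* combined_idx is also page_idx & ~(1 << order), the expression the
	 * removed __find_combined_index() used. */
	printf("buddy of %lu (order %u) is %lu, merged block starts at %lu\n",
	       page_idx, order, buddy_idx, combined_idx);
	return 0;
}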
@@ -563,7 +569,8 @@ static inline int free_pages_check(struct page *page)
563 if (unlikely(page_mapcount(page) | 569 if (unlikely(page_mapcount(page) |
564 (page->mapping != NULL) | 570 (page->mapping != NULL) |
565 (atomic_read(&page->_count) != 0) | 571 (atomic_read(&page->_count) != 0) |
566 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 572 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
573 (mem_cgroup_bad_page_check(page)))) {
567 bad_page(page); 574 bad_page(page);
568 return 1; 575 return 1;
569 } 576 }
@@ -612,6 +619,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
612 list = &pcp->lists[migratetype]; 619 list = &pcp->lists[migratetype];
613 } while (list_empty(list)); 620 } while (list_empty(list));
614 621
622 /* This is the only non-empty list. Free them all. */
623 if (batch_free == MIGRATE_PCPTYPES)
624 batch_free = to_free;
625
615 do { 626 do {
616 page = list_entry(list->prev, struct page, lru); 627 page = list_entry(list->prev, struct page, lru);
617 /* must delete as __free_one_page list manipulates */ 628 /* must delete as __free_one_page list manipulates */
@@ -645,13 +656,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
645 trace_mm_page_free_direct(page, order); 656 trace_mm_page_free_direct(page, order);
646 kmemcheck_free_shadow(page, order); 657 kmemcheck_free_shadow(page, order);
647 658
648 for (i = 0; i < (1 << order); i++) { 659 if (PageAnon(page))
649 struct page *pg = page + i; 660 page->mapping = NULL;
650 661 for (i = 0; i < (1 << order); i++)
651 if (PageAnon(pg)) 662 bad += free_pages_check(page + i);
652 pg->mapping = NULL;
653 bad += free_pages_check(pg);
654 }
655 if (bad) 663 if (bad)
656 return false; 664 return false;
657 665
@@ -751,7 +759,8 @@ static inline int check_new_page(struct page *page)
751 if (unlikely(page_mapcount(page) | 759 if (unlikely(page_mapcount(page) |
752 (page->mapping != NULL) | 760 (page->mapping != NULL) |
753 (atomic_read(&page->_count) != 0) | 761 (atomic_read(&page->_count) != 0) |
754 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 762 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
763 (mem_cgroup_bad_page_check(page)))) {
755 bad_page(page); 764 bad_page(page);
756 return 1; 765 return 1;
757 } 766 }
@@ -864,9 +873,8 @@ static int move_freepages(struct zone *zone,
864 } 873 }
865 874
866 order = page_order(page); 875 order = page_order(page);
867 list_del(&page->lru); 876 list_move(&page->lru,
868 list_add(&page->lru, 877 &zone->free_area[order].free_list[migratetype]);
869 &zone->free_area[order].free_list[migratetype]);
870 page += 1 << order; 878 page += 1 << order;
871 pages_moved += 1 << order; 879 pages_moved += 1 << order;
872 } 880 }
@@ -937,7 +945,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
937 * If breaking a large block of pages, move all free 945 * If breaking a large block of pages, move all free
938 * pages to the preferred allocation list. If falling 946 * pages to the preferred allocation list. If falling
939 * back for a reclaimable kernel allocation, be more 947 * back for a reclaimable kernel allocation, be more
940 * agressive about taking ownership of free pages 948 * aggressive about taking ownership of free pages
941 */ 949 */
942 if (unlikely(current_order >= (pageblock_order >> 1)) || 950 if (unlikely(current_order >= (pageblock_order >> 1)) ||
943 start_migratetype == MIGRATE_RECLAIMABLE || 951 start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1089,8 +1097,10 @@ static void drain_pages(unsigned int cpu)
1089 pset = per_cpu_ptr(zone->pageset, cpu); 1097 pset = per_cpu_ptr(zone->pageset, cpu);
1090 1098
1091 pcp = &pset->pcp; 1099 pcp = &pset->pcp;
1092 free_pcppages_bulk(zone, pcp->count, pcp); 1100 if (pcp->count) {
1093 pcp->count = 0; 1101 free_pcppages_bulk(zone, pcp->count, pcp);
1102 pcp->count = 0;
1103 }
1094 local_irq_restore(flags); 1104 local_irq_restore(flags);
1095 } 1105 }
1096} 1106}
@@ -1332,7 +1342,7 @@ again:
1332 } 1342 }
1333 1343
1334 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1344 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1335 zone_statistics(preferred_zone, zone); 1345 zone_statistics(preferred_zone, zone, gfp_flags);
1336 local_irq_restore(flags); 1346 local_irq_restore(flags);
1337 1347
1338 VM_BUG_ON(bad_range(zone, page)); 1348 VM_BUG_ON(bad_range(zone, page));
@@ -1454,24 +1464,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1454#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1464#endif /* CONFIG_FAIL_PAGE_ALLOC */
1455 1465
1456/* 1466/*
1457 * Return 1 if free pages are above 'mark'. This takes into account the order 1467 * Return true if free pages are above 'mark'. This takes into account the order
1458 * of the allocation. 1468 * of the allocation.
1459 */ 1469 */
1460int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1470static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1461 int classzone_idx, int alloc_flags) 1471 int classzone_idx, int alloc_flags, long free_pages)
1462{ 1472{
1463 /* free_pages my go negative - that's OK */ 1473 /* free_pages my go negative - that's OK */
1464 long min = mark; 1474 long min = mark;
1465 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1466 int o; 1475 int o;
1467 1476
1477 free_pages -= (1 << order) + 1;
1468 if (alloc_flags & ALLOC_HIGH) 1478 if (alloc_flags & ALLOC_HIGH)
1469 min -= min / 2; 1479 min -= min / 2;
1470 if (alloc_flags & ALLOC_HARDER) 1480 if (alloc_flags & ALLOC_HARDER)
1471 min -= min / 4; 1481 min -= min / 4;
1472 1482
1473 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1483 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1474 return 0; 1484 return false;
1475 for (o = 0; o < order; o++) { 1485 for (o = 0; o < order; o++) {
1476 /* At the next order, this order's pages become unavailable */ 1486 /* At the next order, this order's pages become unavailable */
1477 free_pages -= z->free_area[o].nr_free << o; 1487 free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1490,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1480 min >>= 1; 1490 min >>= 1;
1481 1491
1482 if (free_pages <= min) 1492 if (free_pages <= min)
1483 return 0; 1493 return false;
1484 } 1494 }
1485 return 1; 1495 return true;
1496}
1497
1498bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1499 int classzone_idx, int alloc_flags)
1500{
1501 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1502 zone_page_state(z, NR_FREE_PAGES));
1503}
1504
1505bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1506 int classzone_idx, int alloc_flags)
1507{
1508 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1509
1510 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1511 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1512
1513 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1514 free_pages);
1486} 1515}
1487 1516
1488#ifdef CONFIG_NUMA 1517#ifdef CONFIG_NUMA
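The watermark rework above moves the core check into __zone_watermark_ok(), which takes the free-page count as a parameter so that zone_watermark_ok_safe() can substitute a drift-corrected snapshot when the per-cpu counters may be stale. A simplified standalone sketch of the core loop follows; struct zone_sketch and the numbers in main() are made up for illustration and are not the kernel structures.

#include <stdbool.h>
#include <stdio.h>

#define MAX_ORDER 11

/* Simplified zone: only the fields the check needs. */
struct zone_sketch {
	long lowmem_reserve;              /* reserve for this classzone */
	unsigned long nr_free[MAX_ORDER]; /* free blocks per order */
};

/* Core of __zone_watermark_ok(): free_pages is passed in by the caller. */
static bool watermark_ok(struct zone_sketch *z, int order,
			 unsigned long mark, long free_pages)
{
	long min = mark;
	int o;

	free_pages -= (1 << order) + 1;   /* same adjustment as the patch */
	if (free_pages <= min + z->lowmem_reserve)
		return false;

	for (o = 0; o < order; o++) {
		/* At the next order, this order's pages become unavailable. */
		free_pages -= z->nr_free[o] << o;
		min >>= 1;                /* require less at each higher order */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	struct zone_sketch z = { .lowmem_reserve = 32,
				 .nr_free = { 64, 32, 16, 8, 4, 2, 1 } };

	printf("order-0 ok: %d\n", watermark_ok(&z, 0, 128, 512));  /* 1 */
	printf("order-3 ok: %d\n", watermark_ok(&z, 3, 128, 200));  /* 0 */
	return 0;
}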
@@ -1694,6 +1723,59 @@ try_next_zone:
1694 return page; 1723 return page;
1695} 1724}
1696 1725
1726/*
1727 * Large machines with many possible nodes should not always dump per-node
1728 * meminfo in irq context.
1729 */
1730static inline bool should_suppress_show_mem(void)
1731{
1732 bool ret = false;
1733
1734#if NODES_SHIFT > 8
1735 ret = in_interrupt();
1736#endif
1737 return ret;
1738}
1739
1740static DEFINE_RATELIMIT_STATE(nopage_rs,
1741 DEFAULT_RATELIMIT_INTERVAL,
1742 DEFAULT_RATELIMIT_BURST);
1743
1744void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1745{
1746 va_list args;
1747 unsigned int filter = SHOW_MEM_FILTER_NODES;
1748
1749 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
1750 return;
1751
1752 /*
1753 * This documents exceptions given to allocations in certain
1754 * contexts that are allowed to allocate outside current's set
1755 * of allowed nodes.
1756 */
1757 if (!(gfp_mask & __GFP_NOMEMALLOC))
1758 if (test_thread_flag(TIF_MEMDIE) ||
1759 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1760 filter &= ~SHOW_MEM_FILTER_NODES;
1761 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1762 filter &= ~SHOW_MEM_FILTER_NODES;
1763
1764 if (fmt) {
1765 printk(KERN_WARNING);
1766 va_start(args, fmt);
1767 vprintk(fmt, args);
1768 va_end(args);
1769 }
1770
1771 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
1772 current->comm, order, gfp_mask);
1773
1774 dump_stack();
1775 if (!should_suppress_show_mem())
1776 show_mem(filter);
1777}
1778
1697static inline int 1779static inline int
1698should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1780should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1699 unsigned long pages_reclaimed) 1781 unsigned long pages_reclaimed)
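warn_alloc_failed() above rate-limits allocation-failure reports with a DEFINE_RATELIMIT_STATE() of DEFAULT_RATELIMIT_INTERVAL/BURST. A rough userspace stand-in for that interval-plus-burst limiter, using second-granularity time and illustrative window values:

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Simplified stand-in for the kernel's ratelimit state: allow at most
 * 'burst' messages per 'interval' seconds. */
struct ratelimit_sketch {
	time_t interval;   /* window length in seconds */
	int burst;         /* messages allowed per window */
	time_t begin;      /* start of the current window */
	int printed;       /* messages emitted in the current window */
};

static bool ratelimit_ok(struct ratelimit_sketch *rs)
{
	time_t now = time(NULL);

	if (rs->begin == 0 || now - rs->begin >= rs->interval) {
		rs->begin = now;      /* open a new window */
		rs->printed = 0;
	}
	if (rs->printed < rs->burst) {
		rs->printed++;
		return true;          /* caller may print */
	}
	return false;                 /* suppress this one */
}

int main(void)
{
	struct ratelimit_sketch nopage_rs = { .interval = 5, .burst = 10 };

	for (int i = 0; i < 20; i++)
		if (ratelimit_ok(&nopage_rs))
			fprintf(stderr, "page allocation failure: order:%d\n", i % 4);
	return 0;
}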
@@ -1787,15 +1869,18 @@ static struct page *
1787__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1869__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1788 struct zonelist *zonelist, enum zone_type high_zoneidx, 1870 struct zonelist *zonelist, enum zone_type high_zoneidx,
1789 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1871 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1790 int migratetype, unsigned long *did_some_progress) 1872 int migratetype, unsigned long *did_some_progress,
1873 bool sync_migration)
1791{ 1874{
1792 struct page *page; 1875 struct page *page;
1793 1876
1794 if (!order || compaction_deferred(preferred_zone)) 1877 if (!order || compaction_deferred(preferred_zone))
1795 return NULL; 1878 return NULL;
1796 1879
1880 current->flags |= PF_MEMALLOC;
1797 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1881 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1798 nodemask); 1882 nodemask, sync_migration);
1883 current->flags &= ~PF_MEMALLOC;
1799 if (*did_some_progress != COMPACT_SKIPPED) { 1884 if (*did_some_progress != COMPACT_SKIPPED) {
1800 1885
1801 /* Page migration frees to the PCP lists but we want merging */ 1886 /* Page migration frees to the PCP lists but we want merging */
@@ -1831,7 +1916,8 @@ static inline struct page *
1831__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1916__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1832 struct zonelist *zonelist, enum zone_type high_zoneidx, 1917 struct zonelist *zonelist, enum zone_type high_zoneidx,
1833 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1918 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1834 int migratetype, unsigned long *did_some_progress) 1919 int migratetype, unsigned long *did_some_progress,
1920 bool sync_migration)
1835{ 1921{
1836 return NULL; 1922 return NULL;
1837} 1923}
@@ -1846,23 +1932,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1846{ 1932{
1847 struct page *page = NULL; 1933 struct page *page = NULL;
1848 struct reclaim_state reclaim_state; 1934 struct reclaim_state reclaim_state;
1849 struct task_struct *p = current;
1850 bool drained = false; 1935 bool drained = false;
1851 1936
1852 cond_resched(); 1937 cond_resched();
1853 1938
1854 /* We now go into synchronous reclaim */ 1939 /* We now go into synchronous reclaim */
1855 cpuset_memory_pressure_bump(); 1940 cpuset_memory_pressure_bump();
1856 p->flags |= PF_MEMALLOC; 1941 current->flags |= PF_MEMALLOC;
1857 lockdep_set_current_reclaim_state(gfp_mask); 1942 lockdep_set_current_reclaim_state(gfp_mask);
1858 reclaim_state.reclaimed_slab = 0; 1943 reclaim_state.reclaimed_slab = 0;
1859 p->reclaim_state = &reclaim_state; 1944 current->reclaim_state = &reclaim_state;
1860 1945
1861 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1946 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1862 1947
1863 p->reclaim_state = NULL; 1948 current->reclaim_state = NULL;
1864 lockdep_clear_current_reclaim_state(); 1949 lockdep_clear_current_reclaim_state();
1865 p->flags &= ~PF_MEMALLOC; 1950 current->flags &= ~PF_MEMALLOC;
1866 1951
1867 cond_resched(); 1952 cond_resched();
1868 1953
@@ -1906,7 +1991,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1906 preferred_zone, migratetype); 1991 preferred_zone, migratetype);
1907 1992
1908 if (!page && gfp_mask & __GFP_NOFAIL) 1993 if (!page && gfp_mask & __GFP_NOFAIL)
1909 congestion_wait(BLK_RW_ASYNC, HZ/50); 1994 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1910 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1995 } while (!page && (gfp_mask & __GFP_NOFAIL));
1911 1996
1912 return page; 1997 return page;
@@ -1914,24 +1999,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1914 1999
1915static inline 2000static inline
1916void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2001void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1917 enum zone_type high_zoneidx) 2002 enum zone_type high_zoneidx,
2003 enum zone_type classzone_idx)
1918{ 2004{
1919 struct zoneref *z; 2005 struct zoneref *z;
1920 struct zone *zone; 2006 struct zone *zone;
1921 2007
1922 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2008 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1923 wakeup_kswapd(zone, order); 2009 wakeup_kswapd(zone, order, classzone_idx);
1924} 2010}
1925 2011
1926static inline int 2012static inline int
1927gfp_to_alloc_flags(gfp_t gfp_mask) 2013gfp_to_alloc_flags(gfp_t gfp_mask)
1928{ 2014{
1929 struct task_struct *p = current;
1930 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2015 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1931 const gfp_t wait = gfp_mask & __GFP_WAIT; 2016 const gfp_t wait = gfp_mask & __GFP_WAIT;
1932 2017
1933 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2018 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1934 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); 2019 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
1935 2020
1936 /* 2021 /*
1937 * The caller may dip into page reserves a bit more if the caller 2022 * The caller may dip into page reserves a bit more if the caller
@@ -1939,21 +2024,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1939 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2024 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1940 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2025 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1941 */ 2026 */
1942 alloc_flags |= (gfp_mask & __GFP_HIGH); 2027 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1943 2028
1944 if (!wait) { 2029 if (!wait) {
1945 alloc_flags |= ALLOC_HARDER; 2030 /*
2031 * Not worth trying to allocate harder for
2032 * __GFP_NOMEMALLOC even if it can't schedule.
2033 */
2034 if (!(gfp_mask & __GFP_NOMEMALLOC))
2035 alloc_flags |= ALLOC_HARDER;
1946 /* 2036 /*
1947 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2037 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1948 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2038 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1949 */ 2039 */
1950 alloc_flags &= ~ALLOC_CPUSET; 2040 alloc_flags &= ~ALLOC_CPUSET;
1951 } else if (unlikely(rt_task(p)) && !in_interrupt()) 2041 } else if (unlikely(rt_task(current)) && !in_interrupt())
1952 alloc_flags |= ALLOC_HARDER; 2042 alloc_flags |= ALLOC_HARDER;
1953 2043
1954 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2044 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1955 if (!in_interrupt() && 2045 if (!in_interrupt() &&
1956 ((p->flags & PF_MEMALLOC) || 2046 ((current->flags & PF_MEMALLOC) ||
1957 unlikely(test_thread_flag(TIF_MEMDIE)))) 2047 unlikely(test_thread_flag(TIF_MEMDIE))))
1958 alloc_flags |= ALLOC_NO_WATERMARKS; 2048 alloc_flags |= ALLOC_NO_WATERMARKS;
1959 } 2049 }
@@ -1972,7 +2062,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1972 int alloc_flags; 2062 int alloc_flags;
1973 unsigned long pages_reclaimed = 0; 2063 unsigned long pages_reclaimed = 0;
1974 unsigned long did_some_progress; 2064 unsigned long did_some_progress;
1975 struct task_struct *p = current; 2065 bool sync_migration = false;
1976 2066
1977 /* 2067 /*
1978 * In the slowpath, we sanity check order to avoid ever trying to 2068 * In the slowpath, we sanity check order to avoid ever trying to
@@ -1997,7 +2087,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1997 goto nopage; 2087 goto nopage;
1998 2088
1999restart: 2089restart:
2000 wake_all_kswapd(order, zonelist, high_zoneidx); 2090 if (!(gfp_mask & __GFP_NO_KSWAPD))
2091 wake_all_kswapd(order, zonelist, high_zoneidx,
2092 zone_idx(preferred_zone));
2001 2093
2002 /* 2094 /*
2003 * OK, we're below the kswapd watermark and have kicked background 2095 * OK, we're below the kswapd watermark and have kicked background
@@ -2006,6 +2098,15 @@ restart:
2006 */ 2098 */
2007 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2099 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2008 2100
2101 /*
2102 * Find the true preferred zone if the allocation is unconstrained by
2103 * cpusets.
2104 */
2105 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2106 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2107 &preferred_zone);
2108
2109rebalance:
2009 /* This is the last chance, in general, before the goto nopage. */ 2110 /* This is the last chance, in general, before the goto nopage. */
2010 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2111 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2011 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2112 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2013,7 +2114,6 @@ restart:
2013 if (page) 2114 if (page)
2014 goto got_pg; 2115 goto got_pg;
2015 2116
2016rebalance:
2017 /* Allocate without watermarks if the context allows */ 2117 /* Allocate without watermarks if the context allows */
2018 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2118 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2019 page = __alloc_pages_high_priority(gfp_mask, order, 2119 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2028,21 +2128,26 @@ rebalance:
2028 goto nopage; 2128 goto nopage;
2029 2129
2030 /* Avoid recursion of direct reclaim */ 2130 /* Avoid recursion of direct reclaim */
2031 if (p->flags & PF_MEMALLOC) 2131 if (current->flags & PF_MEMALLOC)
2032 goto nopage; 2132 goto nopage;
2033 2133
2034 /* Avoid allocations with no watermarks from looping endlessly */ 2134 /* Avoid allocations with no watermarks from looping endlessly */
2035 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2135 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2036 goto nopage; 2136 goto nopage;
2037 2137
2038 /* Try direct compaction */ 2138 /*
2139 * Try direct compaction. The first pass is asynchronous. Subsequent
2140 * attempts after direct reclaim are synchronous
2141 */
2039 page = __alloc_pages_direct_compact(gfp_mask, order, 2142 page = __alloc_pages_direct_compact(gfp_mask, order,
2040 zonelist, high_zoneidx, 2143 zonelist, high_zoneidx,
2041 nodemask, 2144 nodemask,
2042 alloc_flags, preferred_zone, 2145 alloc_flags, preferred_zone,
2043 migratetype, &did_some_progress); 2146 migratetype, &did_some_progress,
2147 sync_migration);
2044 if (page) 2148 if (page)
2045 goto got_pg; 2149 goto got_pg;
2150 sync_migration = true;
2046 2151
2047 /* Try direct reclaim and then allocating */ 2152 /* Try direct reclaim and then allocating */
2048 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2153 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2094,18 +2199,26 @@ rebalance:
2094 pages_reclaimed += did_some_progress; 2199 pages_reclaimed += did_some_progress;
2095 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2200 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2096 /* Wait for some write requests to complete then retry */ 2201 /* Wait for some write requests to complete then retry */
2097 congestion_wait(BLK_RW_ASYNC, HZ/50); 2202 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2098 goto rebalance; 2203 goto rebalance;
2204 } else {
2205 /*
2206 * High-order allocations do not necessarily loop after
2207 * direct reclaim and reclaim/compaction depends on compaction
2208 * being called after reclaim so call directly if necessary
2209 */
2210 page = __alloc_pages_direct_compact(gfp_mask, order,
2211 zonelist, high_zoneidx,
2212 nodemask,
2213 alloc_flags, preferred_zone,
2214 migratetype, &did_some_progress,
2215 sync_migration);
2216 if (page)
2217 goto got_pg;
2099 } 2218 }
2100 2219
2101nopage: 2220nopage:
2102 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2221 warn_alloc_failed(gfp_mask, order, NULL);
2103 printk(KERN_WARNING "%s: page allocation failure."
2104 " order:%d, mode:0x%x\n",
2105 p->comm, order, gfp_mask);
2106 dump_stack();
2107 show_mem();
2108 }
2109 return page; 2222 return page;
2110got_pg: 2223got_pg:
2111 if (kmemcheck_enabled) 2224 if (kmemcheck_enabled)
@@ -2145,7 +2258,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2145 2258
2146 get_mems_allowed(); 2259 get_mems_allowed();
2147 /* The preferred zone is used for statistics later */ 2260 /* The preferred zone is used for statistics later */
2148 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2261 first_zones_zonelist(zonelist, high_zoneidx,
2262 nodemask ? : &cpuset_current_mems_allowed,
2263 &preferred_zone);
2149 if (!preferred_zone) { 2264 if (!preferred_zone) {
2150 put_mems_allowed(); 2265 put_mems_allowed();
2151 return NULL; 2266 return NULL;
@@ -2224,6 +2339,21 @@ void free_pages(unsigned long addr, unsigned int order)
2224 2339
2225EXPORT_SYMBOL(free_pages); 2340EXPORT_SYMBOL(free_pages);
2226 2341
2342static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2343{
2344 if (addr) {
2345 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2346 unsigned long used = addr + PAGE_ALIGN(size);
2347
2348 split_page(virt_to_page((void *)addr), order);
2349 while (used < alloc_end) {
2350 free_page(used);
2351 used += PAGE_SIZE;
2352 }
2353 }
2354 return (void *)addr;
2355}
2356
2227/** 2357/**
2228 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 2358 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
2229 * @size: the number of bytes to allocate 2359 * @size: the number of bytes to allocate
@@ -2243,22 +2373,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2243 unsigned long addr; 2373 unsigned long addr;
2244 2374
2245 addr = __get_free_pages(gfp_mask, order); 2375 addr = __get_free_pages(gfp_mask, order);
2246 if (addr) { 2376 return make_alloc_exact(addr, order, size);
2247 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2248 unsigned long used = addr + PAGE_ALIGN(size);
2249
2250 split_page(virt_to_page((void *)addr), order);
2251 while (used < alloc_end) {
2252 free_page(used);
2253 used += PAGE_SIZE;
2254 }
2255 }
2256
2257 return (void *)addr;
2258} 2377}
2259EXPORT_SYMBOL(alloc_pages_exact); 2378EXPORT_SYMBOL(alloc_pages_exact);
2260 2379
2261/** 2380/**
2381 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2382 * pages on a node.
2383 * @nid: the preferred node ID where memory should be allocated
2384 * @size: the number of bytes to allocate
2385 * @gfp_mask: GFP flags for the allocation
2386 *
2387 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2388 * back.
2389 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2390 * but is not exact.
2391 */
2392void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2393{
2394 unsigned order = get_order(size);
2395 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2396 if (!p)
2397 return NULL;
2398 return make_alloc_exact((unsigned long)page_address(p), order, size);
2399}
2400EXPORT_SYMBOL(alloc_pages_exact_nid);
2401
2402/**
2262 * free_pages_exact - release memory allocated via alloc_pages_exact() 2403 * free_pages_exact - release memory allocated via alloc_pages_exact()
2263 * @virt: the value returned by alloc_pages_exact. 2404 * @virt: the value returned by alloc_pages_exact.
2264 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2405 * @size: size of allocation, same value as passed to alloc_pages_exact().
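make_alloc_exact() above captures the alloc_pages_exact() trick: allocate the next power-of-two order, split it, and free the tail pages past PAGE_ALIGN(size). The sketch below works through only the size arithmetic in userspace, assuming 4 KiB pages; get_order_sketch() mimics the kernel's get_order() contract but is not that function.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)               /* assume 4 KiB pages */
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Smallest order such that 2^order pages cover 'size' bytes. */
static unsigned int get_order_sketch(unsigned long size)
{
	unsigned int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long size = 5 * PAGE_SIZE + 123;        /* wants 6 pages */
	unsigned int order = get_order_sketch(size);     /* order 3 = 8 pages */
	unsigned long used_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
	unsigned long freed_tail = (1UL << order) - used_pages;

	/* alloc_pages_exact() allocates 2^order pages, splits them, then
	 * frees the 'freed_tail' pages beyond PAGE_ALIGN(size). */
	printf("order %u (%lu pages), keep %lu, free %lu tail pages\n",
	       order, 1UL << order, used_pages, freed_tail);
	return 0;
}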
@@ -2352,19 +2493,41 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2352} 2493}
2353#endif 2494#endif
2354 2495
2496/*
2497 * Determine whether the node should be displayed or not, depending on whether
2498 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2499 */
2500bool skip_free_areas_node(unsigned int flags, int nid)
2501{
2502 bool ret = false;
2503
2504 if (!(flags & SHOW_MEM_FILTER_NODES))
2505 goto out;
2506
2507 get_mems_allowed();
2508 ret = !node_isset(nid, cpuset_current_mems_allowed);
2509 put_mems_allowed();
2510out:
2511 return ret;
2512}
2513
2355#define K(x) ((x) << (PAGE_SHIFT-10)) 2514#define K(x) ((x) << (PAGE_SHIFT-10))
2356 2515
2357/* 2516/*
2358 * Show free area list (used inside shift_scroll-lock stuff) 2517 * Show free area list (used inside shift_scroll-lock stuff)
2359 * We also calculate the percentage fragmentation. We do this by counting the 2518 * We also calculate the percentage fragmentation. We do this by counting the
2360 * memory on each free list with the exception of the first item on the list. 2519 * memory on each free list with the exception of the first item on the list.
2520 * Suppresses nodes that are not allowed by current's cpuset if
2521 * SHOW_MEM_FILTER_NODES is passed.
2361 */ 2522 */
2362void show_free_areas(void) 2523void show_free_areas(unsigned int filter)
2363{ 2524{
2364 int cpu; 2525 int cpu;
2365 struct zone *zone; 2526 struct zone *zone;
2366 2527
2367 for_each_populated_zone(zone) { 2528 for_each_populated_zone(zone) {
2529 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2530 continue;
2368 show_node(zone); 2531 show_node(zone);
2369 printk("%s per-cpu:\n", zone->name); 2532 printk("%s per-cpu:\n", zone->name);
2370 2533
@@ -2406,6 +2569,8 @@ void show_free_areas(void)
2406 for_each_populated_zone(zone) { 2569 for_each_populated_zone(zone) {
2407 int i; 2570 int i;
2408 2571
2572 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2573 continue;
2409 show_node(zone); 2574 show_node(zone);
2410 printk("%s" 2575 printk("%s"
2411 " free:%lukB" 2576 " free:%lukB"
@@ -2436,7 +2601,7 @@ void show_free_areas(void)
2436 " all_unreclaimable? %s" 2601 " all_unreclaimable? %s"
2437 "\n", 2602 "\n",
2438 zone->name, 2603 zone->name,
2439 K(zone_nr_free_pages(zone)), 2604 K(zone_page_state(zone, NR_FREE_PAGES)),
2440 K(min_wmark_pages(zone)), 2605 K(min_wmark_pages(zone)),
2441 K(low_wmark_pages(zone)), 2606 K(low_wmark_pages(zone)),
2442 K(high_wmark_pages(zone)), 2607 K(high_wmark_pages(zone)),
@@ -2473,6 +2638,8 @@ void show_free_areas(void)
2473 for_each_populated_zone(zone) { 2638 for_each_populated_zone(zone) {
2474 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2639 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2475 2640
2641 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2642 continue;
2476 show_node(zone); 2643 show_node(zone);
2477 printk("%s: ", zone->name); 2644 printk("%s: ", zone->name);
2478 2645
@@ -2579,9 +2746,16 @@ static int __parse_numa_zonelist_order(char *s)
2579 2746
2580static __init int setup_numa_zonelist_order(char *s) 2747static __init int setup_numa_zonelist_order(char *s)
2581{ 2748{
2582 if (s) 2749 int ret;
2583 return __parse_numa_zonelist_order(s); 2750
2584 return 0; 2751 if (!s)
2752 return 0;
2753
2754 ret = __parse_numa_zonelist_order(s);
2755 if (ret == 0)
2756 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2757
2758 return ret;
2585} 2759}
2586early_param("numa_zonelist_order", setup_numa_zonelist_order); 2760early_param("numa_zonelist_order", setup_numa_zonelist_order);
2587 2761
@@ -3007,14 +3181,6 @@ static __init_refok int __build_all_zonelists(void *data)
3007 build_zonelist_cache(pgdat); 3181 build_zonelist_cache(pgdat);
3008 } 3182 }
3009 3183
3010#ifdef CONFIG_MEMORY_HOTPLUG
3011 /* Setup real pagesets for the new zone */
3012 if (data) {
3013 struct zone *zone = data;
3014 setup_zone_pageset(zone);
3015 }
3016#endif
3017
3018 /* 3184 /*
3019 * Initialize the boot_pagesets that are going to be used 3185 * Initialize the boot_pagesets that are going to be used
3020 * for bootstrapping processors. The real pagesets for 3186 * for bootstrapping processors. The real pagesets for
@@ -3052,7 +3218,7 @@ static __init_refok int __build_all_zonelists(void *data)
3052 * Called with zonelists_mutex held always 3218 * Called with zonelists_mutex held always
3053 * unless system_state == SYSTEM_BOOTING. 3219 * unless system_state == SYSTEM_BOOTING.
3054 */ 3220 */
3055void build_all_zonelists(void *data) 3221void __ref build_all_zonelists(void *data)
3056{ 3222{
3057 set_zonelist_order(); 3223 set_zonelist_order();
3058 3224
@@ -3063,7 +3229,11 @@ void build_all_zonelists(void *data)
3063 } else { 3229 } else {
3064 /* we have to stop all cpus to guarantee there is no user 3230 /* we have to stop all cpus to guarantee there is no user
3065 of zonelist */ 3231 of zonelist */
3066 stop_machine(__build_all_zonelists, data, NULL); 3232#ifdef CONFIG_MEMORY_HOTPLUG
3233 if (data)
3234 setup_zone_pageset((struct zone *)data);
3235#endif
3236 stop_machine(__build_all_zonelists, NULL, NULL);
3067 /* cpuset refresh routine should be here */ 3237 /* cpuset refresh routine should be here */
3068 } 3238 }
3069 vm_total_pages = nr_free_pagecache_pages(); 3239 vm_total_pages = nr_free_pagecache_pages();
@@ -3159,6 +3329,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
3159#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3329#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3160 3330
3161/* 3331/*
3332 * Check if a pageblock contains reserved pages
3333 */
3334static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3335{
3336 unsigned long pfn;
3337
3338 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3339 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3340 return 1;
3341 }
3342 return 0;
3343}
3344
3345/*
3162 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3346 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3163 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3347 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3164 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3348 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3167,7 +3351,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
3167 */ 3351 */
3168static void setup_zone_migrate_reserve(struct zone *zone) 3352static void setup_zone_migrate_reserve(struct zone *zone)
3169{ 3353{
3170 unsigned long start_pfn, pfn, end_pfn; 3354 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3171 struct page *page; 3355 struct page *page;
3172 unsigned long block_migratetype; 3356 unsigned long block_migratetype;
3173 int reserve; 3357 int reserve;
@@ -3197,7 +3381,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3197 continue; 3381 continue;
3198 3382
3199 /* Blocks with reserved pages will never free, skip them. */ 3383 /* Blocks with reserved pages will never free, skip them. */
3200 if (PageReserved(page)) 3384 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3385 if (pageblock_is_reserved(pfn, block_end_pfn))
3201 continue; 3386 continue;
3202 3387
3203 block_migratetype = get_pageblock_migratetype(page); 3388 block_migratetype = get_pageblock_migratetype(page);
@@ -3386,7 +3571,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3386 pcp->batch = PAGE_SHIFT * 8; 3571 pcp->batch = PAGE_SHIFT * 8;
3387} 3572}
3388 3573
3389static __meminit void setup_zone_pageset(struct zone *zone) 3574static void setup_zone_pageset(struct zone *zone)
3390{ 3575{
3391 int cpu; 3576 int cpu;
3392 3577
@@ -3436,7 +3621,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3436 3621
3437 if (!slab_is_available()) { 3622 if (!slab_is_available()) {
3438 zone->wait_table = (wait_queue_head_t *) 3623 zone->wait_table = (wait_queue_head_t *)
3439 alloc_bootmem_node(pgdat, alloc_size); 3624 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3440 } else { 3625 } else {
3441 /* 3626 /*
3442 * This case means that a zone whose size was 0 gets new memory 3627 * This case means that a zone whose size was 0 gets new memory
@@ -3636,68 +3821,87 @@ void __init free_bootmem_with_active_regions(int nid,
3636 } 3821 }
3637} 3822}
3638 3823
3639int __init add_from_early_node_map(struct range *range, int az, 3824#ifdef CONFIG_HAVE_MEMBLOCK
3640 int nr_range, int nid) 3825/*
3826 * Basic iterator support. Return the last range of PFNs for a node
3827 * Note: nid == MAX_NUMNODES returns last region regardless of node
3828 */
3829static int __meminit last_active_region_index_in_nid(int nid)
3641{ 3830{
3642 int i; 3831 int i;
3643 u64 start, end;
3644 3832
3645 /* need to go over early_node_map to find out good range for node */ 3833 for (i = nr_nodemap_entries - 1; i >= 0; i--)
3646 for_each_active_range_index_in_nid(i, nid) { 3834 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3647 start = early_node_map[i].start_pfn; 3835 return i;
3648 end = early_node_map[i].end_pfn; 3836
3649 nr_range = add_range(range, az, nr_range, start, end); 3837 return -1;
3650 }
3651 return nr_range;
3652} 3838}
3653 3839
3654#ifdef CONFIG_NO_BOOTMEM 3840/*
3655void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, 3841 * Basic iterator support. Return the previous active range of PFNs for a node
3842 * Note: nid == MAX_NUMNODES returns next region regardless of node
3843 */
3844static int __meminit previous_active_region_index_in_nid(int index, int nid)
3845{
3846 for (index = index - 1; index >= 0; index--)
3847 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3848 return index;
3849
3850 return -1;
3851}
3852
3853#define for_each_active_range_index_in_nid_reverse(i, nid) \
3854 for (i = last_active_region_index_in_nid(nid); i != -1; \
3855 i = previous_active_region_index_in_nid(i, nid))
3856
3857u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3656 u64 goal, u64 limit) 3858 u64 goal, u64 limit)
3657{ 3859{
3658 int i; 3860 int i;
3659 void *ptr;
3660
3661 if (limit > get_max_mapped())
3662 limit = get_max_mapped();
3663 3861
3664 /* need to go over early_node_map to find out good range for node */ 3862 /* Need to go over early_node_map to find out good range for node */
3665 for_each_active_range_index_in_nid(i, nid) { 3863 for_each_active_range_index_in_nid_reverse(i, nid) {
3666 u64 addr; 3864 u64 addr;
3667 u64 ei_start, ei_last; 3865 u64 ei_start, ei_last;
3866 u64 final_start, final_end;
3668 3867
3669 ei_last = early_node_map[i].end_pfn; 3868 ei_last = early_node_map[i].end_pfn;
3670 ei_last <<= PAGE_SHIFT; 3869 ei_last <<= PAGE_SHIFT;
3671 ei_start = early_node_map[i].start_pfn; 3870 ei_start = early_node_map[i].start_pfn;
3672 ei_start <<= PAGE_SHIFT; 3871 ei_start <<= PAGE_SHIFT;
3673 addr = find_early_area(ei_start, ei_last,
3674 goal, limit, size, align);
3675 3872
3676 if (addr == -1ULL) 3873 final_start = max(ei_start, goal);
3874 final_end = min(ei_last, limit);
3875
3876 if (final_start >= final_end)
3677 continue; 3877 continue;
3678 3878
3679#if 0 3879 addr = memblock_find_in_range(final_start, final_end, size, align);
3680 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3681 nid,
3682 ei_start, ei_last, goal, limit, size,
3683 align, addr);
3684#endif
3685 3880
3686 ptr = phys_to_virt(addr); 3881 if (addr == MEMBLOCK_ERROR)
3687 memset(ptr, 0, size); 3882 continue;
3688 reserve_early_without_check(addr, addr + size, "BOOTMEM"); 3883
3689 /* 3884 return addr;
3690 * The min_count is set to 0 so that bootmem allocated blocks
3691 * are never reported as leaks.
3692 */
3693 kmemleak_alloc(ptr, size, 0, 0);
3694 return ptr;
3695 } 3885 }
3696 3886
3697 return NULL; 3887 return MEMBLOCK_ERROR;
3698} 3888}
3699#endif 3889#endif
3700 3890
3891int __init add_from_early_node_map(struct range *range, int az,
3892 int nr_range, int nid)
3893{
3894 int i;
3895 u64 start, end;
3896
3897 /* need to go over early_node_map to find out good range for node */
3898 for_each_active_range_index_in_nid(i, nid) {
3899 start = early_node_map[i].start_pfn;
3900 end = early_node_map[i].end_pfn;
3901 nr_range = add_range(range, az, nr_range, start, end);
3902 }
3903 return nr_range;
3904}
3701 3905
3702void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3906void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3703{ 3907{
@@ -3779,7 +3983,7 @@ static void __init find_usable_zone_for_movable(void)
3779 3983
3780/* 3984/*
3781 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3985 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3782 * because it is sized independant of architecture. Unlike the other zones, 3986 * because it is sized independent of architecture. Unlike the other zones,
3783 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3987 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3784 * in each node depending on the size of each node and how evenly kernelcore 3988 * in each node depending on the size of each node and how evenly kernelcore
3785 * is distributed. This helper function adjusts the zone ranges 3989 * is distributed. This helper function adjusts the zone ranges
@@ -3994,10 +4198,11 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3994 unsigned long usemapsize = usemap_size(zonesize); 4198 unsigned long usemapsize = usemap_size(zonesize);
3995 zone->pageblock_flags = NULL; 4199 zone->pageblock_flags = NULL;
3996 if (usemapsize) 4200 if (usemapsize)
3997 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 4201 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4202 usemapsize);
3998} 4203}
3999#else 4204#else
4000static void inline setup_usemap(struct pglist_data *pgdat, 4205static inline void setup_usemap(struct pglist_data *pgdat,
4001 struct zone *zone, unsigned long zonesize) {} 4206 struct zone *zone, unsigned long zonesize) {}
4002#endif /* CONFIG_SPARSEMEM */ 4207#endif /* CONFIG_SPARSEMEM */
4003 4208
@@ -4114,10 +4319,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4114 zone->zone_pgdat = pgdat; 4319 zone->zone_pgdat = pgdat;
4115 4320
4116 zone_pcp_init(zone); 4321 zone_pcp_init(zone);
4117 for_each_lru(l) { 4322 for_each_lru(l)
4118 INIT_LIST_HEAD(&zone->lru[l].list); 4323 INIT_LIST_HEAD(&zone->lru[l].list);
4119 zone->reclaim_stat.nr_saved_scan[l] = 0;
4120 }
4121 zone->reclaim_stat.recent_rotated[0] = 0; 4324 zone->reclaim_stat.recent_rotated[0] = 0;
4122 zone->reclaim_stat.recent_rotated[1] = 0; 4325 zone->reclaim_stat.recent_rotated[1] = 0;
4123 zone->reclaim_stat.recent_scanned[0] = 0; 4326 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4160,7 +4363,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4160 size = (end - start) * sizeof(struct page); 4363 size = (end - start) * sizeof(struct page);
4161 map = alloc_remap(pgdat->node_id, size); 4364 map = alloc_remap(pgdat->node_id, size);
4162 if (!map) 4365 if (!map)
4163 map = alloc_bootmem_node(pgdat, size); 4366 map = alloc_bootmem_node_nopanic(pgdat, size);
4164 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4367 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4165 } 4368 }
4166#ifndef CONFIG_NEED_MULTIPLE_NODES 4369#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4732,15 +4935,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4732 dma_reserve = new_dma_reserve; 4935 dma_reserve = new_dma_reserve;
4733} 4936}
4734 4937
4735#ifndef CONFIG_NEED_MULTIPLE_NODES
4736struct pglist_data __refdata contig_page_data = {
4737#ifndef CONFIG_NO_BOOTMEM
4738 .bdata = &bootmem_node_data[0]
4739#endif
4740 };
4741EXPORT_SYMBOL(contig_page_data);
4742#endif
4743
4744void __init free_area_init(unsigned long *zones_size) 4938void __init free_area_init(unsigned long *zones_size)
4745{ 4939{
4746 free_area_init_node(0, zones_size, 4940 free_area_init_node(0, zones_size,
@@ -4934,7 +5128,7 @@ void setup_per_zone_wmarks(void)
4934 * 1TB 101 10GB 5128 * 1TB 101 10GB
4935 * 10TB 320 32GB 5129 * 10TB 320 32GB
4936 */ 5130 */
4937void calculate_zone_inactive_ratio(struct zone *zone) 5131static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
4938{ 5132{
4939 unsigned int gb, ratio; 5133 unsigned int gb, ratio;
4940 5134
@@ -4948,7 +5142,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
4948 zone->inactive_ratio = ratio; 5142 zone->inactive_ratio = ratio;
4949} 5143}
4950 5144
4951static void __init setup_per_zone_inactive_ratio(void) 5145static void __meminit setup_per_zone_inactive_ratio(void)
4952{ 5146{
4953 struct zone *zone; 5147 struct zone *zone;
4954 5148
@@ -4980,7 +5174,7 @@ static void __init setup_per_zone_inactive_ratio(void)
4980 * 8192MB: 11584k 5174 * 8192MB: 11584k
4981 * 16384MB: 16384k 5175 * 16384MB: 16384k
4982 */ 5176 */
4983static int __init init_per_zone_wmark_min(void) 5177int __meminit init_per_zone_wmark_min(void)
4984{ 5178{
4985 unsigned long lowmem_kbytes; 5179 unsigned long lowmem_kbytes;
4986 5180
@@ -4992,6 +5186,7 @@ static int __init init_per_zone_wmark_min(void)
4992 if (min_free_kbytes > 65536) 5186 if (min_free_kbytes > 65536)
4993 min_free_kbytes = 65536; 5187 min_free_kbytes = 65536;
4994 setup_per_zone_wmarks(); 5188 setup_per_zone_wmarks();
5189 refresh_zone_stat_thresholds();
4995 setup_per_zone_lowmem_reserve(); 5190 setup_per_zone_lowmem_reserve();
4996 setup_per_zone_inactive_ratio(); 5191 setup_per_zone_inactive_ratio();
4997 return 0; 5192 return 0;
@@ -5281,26 +5476,71 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5281 * page allocater never alloc memory from ISOLATE block. 5476 * page allocater never alloc memory from ISOLATE block.
5282 */ 5477 */
5283 5478
5479static int
5480__count_immobile_pages(struct zone *zone, struct page *page, int count)
5481{
5482 unsigned long pfn, iter, found;
5483 /*
5484 * For avoiding noise data, lru_add_drain_all() should be called
5485 * If ZONE_MOVABLE, the zone never contains immobile pages
5486 */
5487 if (zone_idx(zone) == ZONE_MOVABLE)
5488 return true;
5489
5490 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5491 return true;
5492
5493 pfn = page_to_pfn(page);
5494 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5495 unsigned long check = pfn + iter;
5496
5497 if (!pfn_valid_within(check))
5498 continue;
5499
5500 page = pfn_to_page(check);
5501 if (!page_count(page)) {
5502 if (PageBuddy(page))
5503 iter += (1 << page_order(page)) - 1;
5504 continue;
5505 }
5506 if (!PageLRU(page))
5507 found++;
5508 /*
5509 * If there are RECLAIMABLE pages, we need to check it.
5510 * But now, memory offline itself doesn't call shrink_slab()
5511 * and it still to be fixed.
5512 */
5513 /*
5514 * If the page is not RAM, page_count()should be 0.
5515 * we don't need more check. This is an _used_ not-movable page.
5516 *
5517 * The problematic thing here is PG_reserved pages. PG_reserved
5518 * is set to both of a memory hole page and a _used_ kernel
5519 * page at boot.
5520 */
5521 if (found > count)
5522 return false;
5523 }
5524 return true;
5525}
5526
5527bool is_pageblock_removable_nolock(struct page *page)
5528{
5529 struct zone *zone = page_zone(page);
5530 return __count_immobile_pages(zone, page, 0);
5531}
5532
5284int set_migratetype_isolate(struct page *page) 5533int set_migratetype_isolate(struct page *page)
5285{ 5534{
5286 struct zone *zone; 5535 struct zone *zone;
5287 struct page *curr_page; 5536 unsigned long flags, pfn;
5288 unsigned long flags, pfn, iter;
5289 unsigned long immobile = 0;
5290 struct memory_isolate_notify arg; 5537 struct memory_isolate_notify arg;
5291 int notifier_ret; 5538 int notifier_ret;
5292 int ret = -EBUSY; 5539 int ret = -EBUSY;
5293 int zone_idx;
5294 5540
5295 zone = page_zone(page); 5541 zone = page_zone(page);
5296 zone_idx = zone_idx(zone);
5297 5542
5298 spin_lock_irqsave(&zone->lock, flags); 5543 spin_lock_irqsave(&zone->lock, flags);
5299 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5300 zone_idx == ZONE_MOVABLE) {
5301 ret = 0;
5302 goto out;
5303 }
5304 5544
5305 pfn = page_to_pfn(page); 5545 pfn = page_to_pfn(page);
5306 arg.start_pfn = pfn; 5546 arg.start_pfn = pfn;
@@ -5320,23 +5560,20 @@ int set_migratetype_isolate(struct page *page)
5320 */ 5560 */
5321 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5561 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5322 notifier_ret = notifier_to_errno(notifier_ret); 5562 notifier_ret = notifier_to_errno(notifier_ret);
5323 if (notifier_ret || !arg.pages_found) 5563 if (notifier_ret)
5324 goto out; 5564 goto out;
5325 5565 /*
5326 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { 5566 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5327 if (!pfn_valid_within(pfn)) 5567 * We just check MOVABLE pages.
5328 continue; 5568 */
5329 5569 if (__count_immobile_pages(zone, page, arg.pages_found))
5330 curr_page = pfn_to_page(iter);
5331 if (!page_count(curr_page) || PageLRU(curr_page))
5332 continue;
5333
5334 immobile++;
5335 }
5336
5337 if (arg.pages_found == immobile)
5338 ret = 0; 5570 ret = 0;
5339 5571
5572 /*
5573 * immobile means "not-on-lru" paes. If immobile is larger than
5574 * removable-by-driver pages reported by notifier, we'll fail.
5575 */
5576
5340out: 5577out:
5341 if (!ret) { 5578 if (!ret) {
5342 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5579 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
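The two hunks above add __count_immobile_pages(), which walks a pageblock, strides over free buddy blocks by 2^order pages, and counts used pages that are not on the LRU; set_migratetype_isolate() then fails if that count exceeds the pages the isolation notifier reported as removable. A toy sketch of the same walk over a mocked page array; the struct, the block size, and the states in main() are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_PAGES 16          /* illustrative, not pageblock_nr_pages */

struct page_sketch {
	int count;        /* 0 means the page is free */
	bool buddy;       /* head of a free buddy block */
	int order;        /* order of that buddy block */
	bool on_lru;      /* page is on an LRU list, i.e. movable */
};

/* Return true if at most 'count' used pages in the block are off-LRU. */
static bool block_is_movable(struct page_sketch *blk, int count)
{
	int found = 0;

	for (int i = 0; i < PAGEBLOCK_PAGES; i++) {
		struct page_sketch *p = &blk[i];

		if (p->count == 0) {
			if (p->buddy)
				i += (1 << p->order) - 1;  /* skip the whole free block */
			continue;
		}
		if (!p->on_lru)
			found++;                           /* used and not movable */
		if (found > count)
			return false;
	}
	return true;
}

int main(void)
{
	struct page_sketch blk[PAGEBLOCK_PAGES] = {
		[0] = { .count = 0, .buddy = true, .order = 3 },  /* 8 free pages */
		[8] = { .count = 2, .on_lru = true },             /* movable */
		[9] = { .count = 1, .on_lru = false },            /* pinned */
	};

	printf("movable with 0 slack: %d\n", block_is_movable(blk, 0));  /* 0 */
	printf("movable with 1 slack: %d\n", block_is_movable(blk, 1));  /* 1 */
	return 0;
}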
@@ -5455,7 +5692,6 @@ static struct trace_print_flags pageflag_names[] = {
5455 {1UL << PG_swapcache, "swapcache" }, 5692 {1UL << PG_swapcache, "swapcache" },
5456 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5693 {1UL << PG_mappedtodisk, "mappedtodisk" },
5457 {1UL << PG_reclaim, "reclaim" }, 5694 {1UL << PG_reclaim, "reclaim" },
5458 {1UL << PG_buddy, "buddy" },
5459 {1UL << PG_swapbacked, "swapbacked" }, 5695 {1UL << PG_swapbacked, "swapbacked" },
5460 {1UL << PG_unevictable, "unevictable" }, 5696 {1UL << PG_unevictable, "unevictable" },
5461#ifdef CONFIG_MMU 5697#ifdef CONFIG_MMU
@@ -5503,7 +5739,8 @@ void dump_page(struct page *page)
5503{ 5739{
5504 printk(KERN_ALERT 5740 printk(KERN_ALERT
5505 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5741 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5506 page, page_count(page), page_mapcount(page), 5742 page, atomic_read(&page->_count), page_mapcount(page),
5507 page->mapping, page->index); 5743 page->mapping, page->index);
5508 dump_page_flags(page->flags); 5744 dump_page_flags(page->flags);
5745 mem_cgroup_print_bad_page(page);
5509} 5746}