path: root/mm/page_alloc.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  936
1 files changed, 555 insertions, 381 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c44ed49ca93..a0de15f46987 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/suspend.h> 28#include <linux/suspend.h>
28#include <linux/pagevec.h> 29#include <linux/pagevec.h>
@@ -46,6 +47,7 @@
46#include <linux/page-isolation.h> 47#include <linux/page-isolation.h>
47#include <linux/page_cgroup.h> 48#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 49#include <linux/debugobjects.h>
50#include <linux/kmemleak.h>
49 51
50#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
51#include <asm/div64.h> 53#include <asm/div64.h>
@@ -71,6 +73,7 @@ unsigned long totalram_pages __read_mostly;
71unsigned long totalreserve_pages __read_mostly; 73unsigned long totalreserve_pages __read_mostly;
72unsigned long highest_memmap_pfn __read_mostly; 74unsigned long highest_memmap_pfn __read_mostly;
73int percpu_pagelist_fraction; 75int percpu_pagelist_fraction;
76gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
74 77
75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 78#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
76int pageblock_order __read_mostly; 79int pageblock_order __read_mostly;
@@ -149,10 +152,6 @@ static unsigned long __meminitdata dma_reserve;
149 static int __meminitdata nr_nodemap_entries; 152 static int __meminitdata nr_nodemap_entries;
150 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 153 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
151 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 154 static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
152#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 static unsigned long __initdata required_kernelcore; 155 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 156 static unsigned long __initdata required_movablecore;
158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 157 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -164,17 +163,25 @@ static unsigned long __meminitdata dma_reserve;
164 163
165#if MAX_NUMNODES > 1 164#if MAX_NUMNODES > 1
166int nr_node_ids __read_mostly = MAX_NUMNODES; 165int nr_node_ids __read_mostly = MAX_NUMNODES;
166int nr_online_nodes __read_mostly = 1;
167EXPORT_SYMBOL(nr_node_ids); 167EXPORT_SYMBOL(nr_node_ids);
168EXPORT_SYMBOL(nr_online_nodes);
168#endif 169#endif
169 170
170int page_group_by_mobility_disabled __read_mostly; 171int page_group_by_mobility_disabled __read_mostly;
171 172
172static void set_pageblock_migratetype(struct page *page, int migratetype) 173static void set_pageblock_migratetype(struct page *page, int migratetype)
173{ 174{
175
176 if (unlikely(page_group_by_mobility_disabled))
177 migratetype = MIGRATE_UNMOVABLE;
178
174 set_pageblock_flags_group(page, (unsigned long)migratetype, 179 set_pageblock_flags_group(page, (unsigned long)migratetype,
175 PB_migrate, PB_migrate_end); 180 PB_migrate, PB_migrate_end);
176} 181}
177 182
183bool oom_killer_disabled __read_mostly;
184
178#ifdef CONFIG_DEBUG_VM 185#ifdef CONFIG_DEBUG_VM
179static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 186static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
180{ 187{
@@ -297,23 +304,6 @@ void prep_compound_page(struct page *page, unsigned long order)
297 } 304 }
298} 305}
299 306
300#ifdef CONFIG_HUGETLBFS
301void prep_compound_gigantic_page(struct page *page, unsigned long order)
302{
303 int i;
304 int nr_pages = 1 << order;
305 struct page *p = page + 1;
306
307 set_compound_page_dtor(page, free_compound_page);
308 set_compound_order(page, order);
309 __SetPageHead(page);
310 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
311 __SetPageTail(p);
312 p->first_page = page;
313 }
314}
315#endif
316
317static int destroy_compound_page(struct page *page, unsigned long order) 307static int destroy_compound_page(struct page *page, unsigned long order)
318{ 308{
319 int i; 309 int i;
@@ -331,7 +321,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
331 for (i = 1; i < nr_pages; i++) { 321 for (i = 1; i < nr_pages; i++) {
332 struct page *p = page + i; 322 struct page *p = page + i;
333 323
334 if (unlikely(!PageTail(p) | (p->first_page != page))) { 324 if (unlikely(!PageTail(p) || (p->first_page != page))) {
335 bad_page(page); 325 bad_page(page);
336 bad++; 326 bad++;
337 } 327 }
@@ -420,7 +410,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
420 return 0; 410 return 0;
421 411
422 if (PageBuddy(buddy) && page_order(buddy) == order) { 412 if (PageBuddy(buddy) && page_order(buddy) == order) {
423 BUG_ON(page_count(buddy) != 0); 413 VM_BUG_ON(page_count(buddy) != 0);
424 return 1; 414 return 1;
425 } 415 }
426 return 0; 416 return 0;
@@ -451,22 +441,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
451 */ 441 */
452 442
453static inline void __free_one_page(struct page *page, 443static inline void __free_one_page(struct page *page,
454 struct zone *zone, unsigned int order) 444 struct zone *zone, unsigned int order,
445 int migratetype)
455{ 446{
456 unsigned long page_idx; 447 unsigned long page_idx;
457 int order_size = 1 << order;
458 int migratetype = get_pageblock_migratetype(page);
459 448
460 if (unlikely(PageCompound(page))) 449 if (unlikely(PageCompound(page)))
461 if (unlikely(destroy_compound_page(page, order))) 450 if (unlikely(destroy_compound_page(page, order)))
462 return; 451 return;
463 452
453 VM_BUG_ON(migratetype == -1);
454
464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 455 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
465 456
466 VM_BUG_ON(page_idx & (order_size - 1)); 457 VM_BUG_ON(page_idx & ((1 << order) - 1));
467 VM_BUG_ON(bad_range(zone, page)); 458 VM_BUG_ON(bad_range(zone, page));
468 459
469 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
470 while (order < MAX_ORDER-1) { 460 while (order < MAX_ORDER-1) {
471 unsigned long combined_idx; 461 unsigned long combined_idx;
472 struct page *buddy; 462 struct page *buddy;
@@ -490,12 +480,26 @@ static inline void __free_one_page(struct page *page,
490 zone->free_area[order].nr_free++; 480 zone->free_area[order].nr_free++;
491} 481}
492 482
483#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
484/*
485 * free_page_mlock() -- clean up attempts to free and mlocked() page.
486 * Page should not be on lru, so no need to fix that up.
487 * free_pages_check() will verify...
488 */
489static inline void free_page_mlock(struct page *page)
490{
491 __dec_zone_page_state(page, NR_MLOCK);
492 __count_vm_event(UNEVICTABLE_MLOCKFREED);
493}
494#else
495static void free_page_mlock(struct page *page) { }
496#endif
497
493static inline int free_pages_check(struct page *page) 498static inline int free_pages_check(struct page *page)
494{ 499{
495 free_page_mlock(page);
496 if (unlikely(page_mapcount(page) | 500 if (unlikely(page_mapcount(page) |
497 (page->mapping != NULL) | 501 (page->mapping != NULL) |
498 (page_count(page) != 0) | 502 (atomic_read(&page->_count) != 0) |
499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 503 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
500 bad_page(page); 504 bad_page(page);
501 return 1; 505 return 1;
@@ -522,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
522 spin_lock(&zone->lock); 526 spin_lock(&zone->lock);
523 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
524 zone->pages_scanned = 0; 528 zone->pages_scanned = 0;
529
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
525 while (count--) { 531 while (count--) {
526 struct page *page; 532 struct page *page;
527 533
@@ -529,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count,
529 page = list_entry(list->prev, struct page, lru); 535 page = list_entry(list->prev, struct page, lru);
530 /* have to delete it as __free_one_page list manipulates */ 536 /* have to delete it as __free_one_page list manipulates */
531 list_del(&page->lru); 537 list_del(&page->lru);
532 __free_one_page(page, zone, order); 538 __free_one_page(page, zone, order, page_private(page));
533 } 539 }
534 spin_unlock(&zone->lock); 540 spin_unlock(&zone->lock);
535} 541}
536 542
537static void free_one_page(struct zone *zone, struct page *page, int order) 543static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype)
538{ 545{
539 spin_lock(&zone->lock); 546 spin_lock(&zone->lock);
540 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
541 zone->pages_scanned = 0; 548 zone->pages_scanned = 0;
542 __free_one_page(page, zone, order); 549
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype);
543 spin_unlock(&zone->lock); 552 spin_unlock(&zone->lock);
544} 553}
545 554
@@ -548,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
548 unsigned long flags; 557 unsigned long flags;
549 int i; 558 int i;
550 int bad = 0; 559 int bad = 0;
560 int wasMlocked = TestClearPageMlocked(page);
561
562 kmemcheck_free_shadow(page, order);
551 563
552 for (i = 0 ; i < (1 << order) ; ++i) 564 for (i = 0 ; i < (1 << order) ; ++i)
553 bad += free_pages_check(page + i); 565 bad += free_pages_check(page + i);
@@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
563 kernel_map_pages(page, 1 << order, 0); 575 kernel_map_pages(page, 1 << order, 0);
564 576
565 local_irq_save(flags); 577 local_irq_save(flags);
578 if (unlikely(wasMlocked))
579 free_page_mlock(page);
566 __count_vm_events(PGFREE, 1 << order); 580 __count_vm_events(PGFREE, 1 << order);
567 free_one_page(page_zone(page), page, order); 581 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page));
568 local_irq_restore(flags); 583 local_irq_restore(flags);
569} 584}
570 585
@@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
635{ 650{
636 if (unlikely(page_mapcount(page) | 651 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 652 (page->mapping != NULL) |
638 (page_count(page) != 0) | 653 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 654 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
640 bad_page(page); 655 bad_page(page);
641 return 1; 656 return 1;
@@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
660 * Go through the free lists for the given migratetype and remove 675 * Go through the free lists for the given migratetype and remove
661 * the smallest available page from the freelists 676 * the smallest available page from the freelists
662 */ 677 */
663static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 678static inline
679struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
664 int migratetype) 680 int migratetype)
665{ 681{
666 unsigned int current_order; 682 unsigned int current_order;
@@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
678 list_del(&page->lru); 694 list_del(&page->lru);
679 rmv_page_order(page); 695 rmv_page_order(page);
680 area->nr_free--; 696 area->nr_free--;
681 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
682 expand(zone, page, order, current_order, area, migratetype); 697 expand(zone, page, order, current_order, area, migratetype);
683 return page; 698 return page;
684 } 699 }
@@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
769} 784}
770 785
771/* Remove an element from the buddy allocator from the fallback list */ 786/* Remove an element from the buddy allocator from the fallback list */
772static struct page *__rmqueue_fallback(struct zone *zone, int order, 787static inline struct page *
773 int start_migratetype) 788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
774{ 789{
775 struct free_area * area; 790 struct free_area * area;
776 int current_order; 791 int current_order;
@@ -802,13 +817,15 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
802 * agressive about taking ownership of free pages 817 * agressive about taking ownership of free pages
803 */ 818 */
804 if (unlikely(current_order >= (pageblock_order >> 1)) || 819 if (unlikely(current_order >= (pageblock_order >> 1)) ||
805 start_migratetype == MIGRATE_RECLAIMABLE) { 820 start_migratetype == MIGRATE_RECLAIMABLE ||
821 page_group_by_mobility_disabled) {
806 unsigned long pages; 822 unsigned long pages;
807 pages = move_freepages_block(zone, page, 823 pages = move_freepages_block(zone, page,
808 start_migratetype); 824 start_migratetype);
809 825
810 /* Claim the whole block if over half of it is free */ 826 /* Claim the whole block if over half of it is free */
811 if (pages >= (1 << (pageblock_order-1))) 827 if (pages >= (1 << (pageblock_order-1)) ||
828 page_group_by_mobility_disabled)
812 set_pageblock_migratetype(page, 829 set_pageblock_migratetype(page,
813 start_migratetype); 830 start_migratetype);
814 831
@@ -818,8 +835,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
818 /* Remove the page from the freelists */ 835 /* Remove the page from the freelists */
819 list_del(&page->lru); 836 list_del(&page->lru);
820 rmv_page_order(page); 837 rmv_page_order(page);
821 __mod_zone_page_state(zone, NR_FREE_PAGES,
822 -(1UL << order));
823 838
824 if (current_order == pageblock_order) 839 if (current_order == pageblock_order)
825 set_pageblock_migratetype(page, 840 set_pageblock_migratetype(page,
@@ -830,8 +845,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
830 } 845 }
831 } 846 }
832 847
833 /* Use MIGRATE_RESERVE rather than fail an allocation */ 848 return NULL;
834 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
835} 849}
836 850
837/* 851/*
@@ -843,11 +857,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
843{ 857{
844 struct page *page; 858 struct page *page;
845 859
860retry_reserve:
846 page = __rmqueue_smallest(zone, order, migratetype); 861 page = __rmqueue_smallest(zone, order, migratetype);
847 862
848 if (unlikely(!page)) 863 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
849 page = __rmqueue_fallback(zone, order, migratetype); 864 page = __rmqueue_fallback(zone, order, migratetype);
850 865
866 /*
867 * Use MIGRATE_RESERVE rather than fail an allocation. goto
868 * is used because __rmqueue_smallest is an inline function
869 * and we want just one call site
870 */
871 if (!page) {
872 migratetype = MIGRATE_RESERVE;
873 goto retry_reserve;
874 }
875 }
876
851 return page; 877 return page;
852} 878}
853 879
@@ -858,7 +884,7 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
858 */ 884 */
859static int rmqueue_bulk(struct zone *zone, unsigned int order, 885static int rmqueue_bulk(struct zone *zone, unsigned int order,
860 unsigned long count, struct list_head *list, 886 unsigned long count, struct list_head *list,
861 int migratetype) 887 int migratetype, int cold)
862{ 888{
863 int i; 889 int i;
864 890
@@ -877,10 +903,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
877 * merge IO requests if the physical pages are ordered 903 * merge IO requests if the physical pages are ordered
878 * properly. 904 * properly.
879 */ 905 */
880 list_add(&page->lru, list); 906 if (likely(cold == 0))
907 list_add(&page->lru, list);
908 else
909 list_add_tail(&page->lru, list);
881 set_page_private(page, migratetype); 910 set_page_private(page, migratetype);
882 list = &page->lru; 911 list = &page->lru;
883 } 912 }
913 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
884 spin_unlock(&zone->lock); 914 spin_unlock(&zone->lock);
885 return i; 915 return i;
886} 916}
@@ -922,13 +952,10 @@ static void drain_pages(unsigned int cpu)
922 unsigned long flags; 952 unsigned long flags;
923 struct zone *zone; 953 struct zone *zone;
924 954
925 for_each_zone(zone) { 955 for_each_populated_zone(zone) {
926 struct per_cpu_pageset *pset; 956 struct per_cpu_pageset *pset;
927 struct per_cpu_pages *pcp; 957 struct per_cpu_pages *pcp;
928 958
929 if (!populated_zone(zone))
930 continue;
931
932 pset = zone_pcp(zone, cpu); 959 pset = zone_pcp(zone, cpu);
933 960
934 pcp = &pset->pcp; 961 pcp = &pset->pcp;
@@ -999,6 +1026,9 @@ static void free_hot_cold_page(struct page *page, int cold)
999 struct zone *zone = page_zone(page); 1026 struct zone *zone = page_zone(page);
1000 struct per_cpu_pages *pcp; 1027 struct per_cpu_pages *pcp;
1001 unsigned long flags; 1028 unsigned long flags;
1029 int wasMlocked = TestClearPageMlocked(page);
1030
1031 kmemcheck_free_shadow(page, 0);
1002 1032
1003 if (PageAnon(page)) 1033 if (PageAnon(page))
1004 page->mapping = NULL; 1034 page->mapping = NULL;
@@ -1013,13 +1043,16 @@ static void free_hot_cold_page(struct page *page, int cold)
1013 kernel_map_pages(page, 1, 0); 1043 kernel_map_pages(page, 1, 0);
1014 1044
1015 pcp = &zone_pcp(zone, get_cpu())->pcp; 1045 pcp = &zone_pcp(zone, get_cpu())->pcp;
1046 set_page_private(page, get_pageblock_migratetype(page));
1016 local_irq_save(flags); 1047 local_irq_save(flags);
1048 if (unlikely(wasMlocked))
1049 free_page_mlock(page);
1017 __count_vm_event(PGFREE); 1050 __count_vm_event(PGFREE);
1051
1018 if (cold) 1052 if (cold)
1019 list_add_tail(&page->lru, &pcp->list); 1053 list_add_tail(&page->lru, &pcp->list);
1020 else 1054 else
1021 list_add(&page->lru, &pcp->list); 1055 list_add(&page->lru, &pcp->list);
1022 set_page_private(page, get_pageblock_migratetype(page));
1023 pcp->count++; 1056 pcp->count++;
1024 if (pcp->count >= pcp->high) { 1057 if (pcp->count >= pcp->high) {
1025 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1058 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -1053,6 +1086,16 @@ void split_page(struct page *page, unsigned int order)
1053 1086
1054 VM_BUG_ON(PageCompound(page)); 1087 VM_BUG_ON(PageCompound(page));
1055 VM_BUG_ON(!page_count(page)); 1088 VM_BUG_ON(!page_count(page));
1089
1090#ifdef CONFIG_KMEMCHECK
1091 /*
1092 * Split shadow pages too, because free(page[0]) would
1093 * otherwise free the whole shadow.
1094 */
1095 if (kmemcheck_page_is_tracked(page))
1096 split_page(virt_to_page(page[0].shadow), order);
1097#endif
1098
1056 for (i = 1; i < (1 << order); i++) 1099 for (i = 1; i < (1 << order); i++)
1057 set_page_refcounted(page + i); 1100 set_page_refcounted(page + i);
1058} 1101}
@@ -1062,14 +1105,15 @@ void split_page(struct page *page, unsigned int order)
1062 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1105 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1063 * or two. 1106 * or two.
1064 */ 1107 */
1065static struct page *buffered_rmqueue(struct zone *preferred_zone, 1108static inline
1066 struct zone *zone, int order, gfp_t gfp_flags) 1109struct page *buffered_rmqueue(struct zone *preferred_zone,
1110 struct zone *zone, int order, gfp_t gfp_flags,
1111 int migratetype)
1067{ 1112{
1068 unsigned long flags; 1113 unsigned long flags;
1069 struct page *page; 1114 struct page *page;
1070 int cold = !!(gfp_flags & __GFP_COLD); 1115 int cold = !!(gfp_flags & __GFP_COLD);
1071 int cpu; 1116 int cpu;
1072 int migratetype = allocflags_to_migratetype(gfp_flags);
1073 1117
1074again: 1118again:
1075 cpu = get_cpu(); 1119 cpu = get_cpu();
@@ -1080,7 +1124,8 @@ again:
1080 local_irq_save(flags); 1124 local_irq_save(flags);
1081 if (!pcp->count) { 1125 if (!pcp->count) {
1082 pcp->count = rmqueue_bulk(zone, 0, 1126 pcp->count = rmqueue_bulk(zone, 0,
1083 pcp->batch, &pcp->list, migratetype); 1127 pcp->batch, &pcp->list,
1128 migratetype, cold);
1084 if (unlikely(!pcp->count)) 1129 if (unlikely(!pcp->count))
1085 goto failed; 1130 goto failed;
1086 } 1131 }
@@ -1099,15 +1144,30 @@ again:
1099 /* Allocate more to the pcp list if necessary */ 1144 /* Allocate more to the pcp list if necessary */
1100 if (unlikely(&page->lru == &pcp->list)) { 1145 if (unlikely(&page->lru == &pcp->list)) {
1101 pcp->count += rmqueue_bulk(zone, 0, 1146 pcp->count += rmqueue_bulk(zone, 0,
1102 pcp->batch, &pcp->list, migratetype); 1147 pcp->batch, &pcp->list,
1148 migratetype, cold);
1103 page = list_entry(pcp->list.next, struct page, lru); 1149 page = list_entry(pcp->list.next, struct page, lru);
1104 } 1150 }
1105 1151
1106 list_del(&page->lru); 1152 list_del(&page->lru);
1107 pcp->count--; 1153 pcp->count--;
1108 } else { 1154 } else {
1155 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1156 /*
1157 * __GFP_NOFAIL is not to be used in new code.
1158 *
1159 * All __GFP_NOFAIL callers should be fixed so that they
1160 * properly detect and handle allocation failures.
1161 *
1162 * We most definitely don't want callers attempting to
1163 * allocate greater than order-1 page units with
1164 * __GFP_NOFAIL.
1165 */
1166 WARN_ON_ONCE(order > 1);
1167 }
1109 spin_lock_irqsave(&zone->lock, flags); 1168 spin_lock_irqsave(&zone->lock, flags);
1110 page = __rmqueue(zone, order, migratetype); 1169 page = __rmqueue(zone, order, migratetype);
1170 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1111 spin_unlock(&zone->lock); 1171 spin_unlock(&zone->lock);
1112 if (!page) 1172 if (!page)
1113 goto failed; 1173 goto failed;
@@ -1129,10 +1189,15 @@ failed:
1129 return NULL; 1189 return NULL;
1130} 1190}
1131 1191
1132#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1192/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1133#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1193#define ALLOC_WMARK_MIN WMARK_MIN
1134#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1194#define ALLOC_WMARK_LOW WMARK_LOW
1135#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1195#define ALLOC_WMARK_HIGH WMARK_HIGH
1196#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1197
1198/* Mask to get the watermark bits */
1199#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1200
1136#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1201#define ALLOC_HARDER 0x10 /* try to alloc harder */
1137#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1202#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1138#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1203#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -1390,23 +1455,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1390 */ 1455 */
1391static struct page * 1456static struct page *
1392get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1457get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1393 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1458 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1459 struct zone *preferred_zone, int migratetype)
1394{ 1460{
1395 struct zoneref *z; 1461 struct zoneref *z;
1396 struct page *page = NULL; 1462 struct page *page = NULL;
1397 int classzone_idx; 1463 int classzone_idx;
1398 struct zone *zone, *preferred_zone; 1464 struct zone *zone;
1399 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1465 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1400 int zlc_active = 0; /* set if using zonelist_cache */ 1466 int zlc_active = 0; /* set if using zonelist_cache */
1401 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1467 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1402 1468
1403 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1404 &preferred_zone);
1405 if (!preferred_zone)
1406 return NULL;
1407
1408 classzone_idx = zone_idx(preferred_zone); 1469 classzone_idx = zone_idx(preferred_zone);
1409
1410zonelist_scan: 1470zonelist_scan:
1411 /* 1471 /*
1412 * Scan zonelist, looking for a zone with enough free. 1472 * Scan zonelist, looking for a zone with enough free.
@@ -1421,31 +1481,49 @@ zonelist_scan:
1421 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1481 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1422 goto try_next_zone; 1482 goto try_next_zone;
1423 1483
1484 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1424 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1485 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1425 unsigned long mark; 1486 unsigned long mark;
1426 if (alloc_flags & ALLOC_WMARK_MIN) 1487 int ret;
1427 mark = zone->pages_min; 1488
1428 else if (alloc_flags & ALLOC_WMARK_LOW) 1489 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1429 mark = zone->pages_low; 1490 if (zone_watermark_ok(zone, order, mark,
1430 else 1491 classzone_idx, alloc_flags))
1431 mark = zone->pages_high; 1492 goto try_this_zone;
1432 if (!zone_watermark_ok(zone, order, mark, 1493
1433 classzone_idx, alloc_flags)) { 1494 if (zone_reclaim_mode == 0)
1434 if (!zone_reclaim_mode || 1495 goto this_zone_full;
1435 !zone_reclaim(zone, gfp_mask, order)) 1496
1497 ret = zone_reclaim(zone, gfp_mask, order);
1498 switch (ret) {
1499 case ZONE_RECLAIM_NOSCAN:
1500 /* did not scan */
1501 goto try_next_zone;
1502 case ZONE_RECLAIM_FULL:
1503 /* scanned but unreclaimable */
1504 goto this_zone_full;
1505 default:
1506 /* did we reclaim enough */
1507 if (!zone_watermark_ok(zone, order, mark,
1508 classzone_idx, alloc_flags))
1436 goto this_zone_full; 1509 goto this_zone_full;
1437 } 1510 }
1438 } 1511 }
1439 1512
1440 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1513try_this_zone:
1514 page = buffered_rmqueue(preferred_zone, zone, order,
1515 gfp_mask, migratetype);
1441 if (page) 1516 if (page)
1442 break; 1517 break;
1443this_zone_full: 1518this_zone_full:
1444 if (NUMA_BUILD) 1519 if (NUMA_BUILD)
1445 zlc_mark_zone_full(zonelist, z); 1520 zlc_mark_zone_full(zonelist, z);
1446try_next_zone: 1521try_next_zone:
1447 if (NUMA_BUILD && !did_zlc_setup) { 1522 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1448 /* we do zlc_setup after the first zone is tried */ 1523 /*
1524 * we do zlc_setup after the first zone is tried but only
1525 * if there are multiple nodes make it worthwhile
1526 */
1449 allowednodes = zlc_setup(zonelist, alloc_flags); 1527 allowednodes = zlc_setup(zonelist, alloc_flags);
1450 zlc_active = 1; 1528 zlc_active = 1;
1451 did_zlc_setup = 1; 1529 did_zlc_setup = 1;
@@ -1460,45 +1538,219 @@ try_next_zone:
1460 return page; 1538 return page;
1461} 1539}
1462 1540
1541static inline int
1542should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1543 unsigned long pages_reclaimed)
1544{
1545 /* Do not loop if specifically requested */
1546 if (gfp_mask & __GFP_NORETRY)
1547 return 0;
1548
1549 /*
1550 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1551 * means __GFP_NOFAIL, but that may not be true in other
1552 * implementations.
1553 */
1554 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1555 return 1;
1556
1557 /*
1558 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1559 * specified, then we retry until we no longer reclaim any pages
1560 * (above), or we've reclaimed an order of pages at least as
1561 * large as the allocation's order. In both cases, if the
1562 * allocation still fails, we stop retrying.
1563 */
1564 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1565 return 1;
1566
1567 /*
1568 * Don't let big-order allocations loop unless the caller
1569 * explicitly requests that.
1570 */
1571 if (gfp_mask & __GFP_NOFAIL)
1572 return 1;
1573
1574 return 0;
1575}
1576
1577static inline struct page *
1578__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1579 struct zonelist *zonelist, enum zone_type high_zoneidx,
1580 nodemask_t *nodemask, struct zone *preferred_zone,
1581 int migratetype)
1582{
1583 struct page *page;
1584
1585 /* Acquire the OOM killer lock for the zones in zonelist */
1586 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1587 schedule_timeout_uninterruptible(1);
1588 return NULL;
1589 }
1590
1591 /*
1592 * Go through the zonelist yet one more time, keep very high watermark
1593 * here, this is only to catch a parallel oom killing, we must fail if
1594 * we're still under heavy pressure.
1595 */
1596 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1597 order, zonelist, high_zoneidx,
1598 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1599 preferred_zone, migratetype);
1600 if (page)
1601 goto out;
1602
1603 /* The OOM killer will not help higher order allocs */
1604 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1605 goto out;
1606
1607 /* Exhausted what can be done so it's blamo time */
1608 out_of_memory(zonelist, gfp_mask, order);
1609
1610out:
1611 clear_zonelist_oom(zonelist, gfp_mask);
1612 return page;
1613}
1614
1615/* The really slow allocator path where we enter direct reclaim */
1616static inline struct page *
1617__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1618 struct zonelist *zonelist, enum zone_type high_zoneidx,
1619 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1620 int migratetype, unsigned long *did_some_progress)
1621{
1622 struct page *page = NULL;
1623 struct reclaim_state reclaim_state;
1624 struct task_struct *p = current;
1625
1626 cond_resched();
1627
1628 /* We now go into synchronous reclaim */
1629 cpuset_memory_pressure_bump();
1630
1631 /*
1632 * The task's cpuset might have expanded its set of allowable nodes
1633 */
1634 p->flags |= PF_MEMALLOC;
1635 lockdep_set_current_reclaim_state(gfp_mask);
1636 reclaim_state.reclaimed_slab = 0;
1637 p->reclaim_state = &reclaim_state;
1638
1639 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1640
1641 p->reclaim_state = NULL;
1642 lockdep_clear_current_reclaim_state();
1643 p->flags &= ~PF_MEMALLOC;
1644
1645 cond_resched();
1646
1647 if (order != 0)
1648 drain_all_pages();
1649
1650 if (likely(*did_some_progress))
1651 page = get_page_from_freelist(gfp_mask, nodemask, order,
1652 zonelist, high_zoneidx,
1653 alloc_flags, preferred_zone,
1654 migratetype);
1655 return page;
1656}
1657
1463/* 1658/*
1464 * This is the 'heart' of the zoned buddy allocator. 1659 * This is called in the allocator slow-path if the allocation request is of
1660 * sufficient urgency to ignore watermarks and take other desperate measures
1465 */ 1661 */
1466struct page * 1662static inline struct page *
1467__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1663__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1468 struct zonelist *zonelist, nodemask_t *nodemask) 1664 struct zonelist *zonelist, enum zone_type high_zoneidx,
1665 nodemask_t *nodemask, struct zone *preferred_zone,
1666 int migratetype)
1667{
1668 struct page *page;
1669
1670 do {
1671 page = get_page_from_freelist(gfp_mask, nodemask, order,
1672 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1673 preferred_zone, migratetype);
1674
1675 if (!page && gfp_mask & __GFP_NOFAIL)
1676 congestion_wait(BLK_RW_ASYNC, HZ/50);
1677 } while (!page && (gfp_mask & __GFP_NOFAIL));
1678
1679 return page;
1680}
1681
1682static inline
1683void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1684 enum zone_type high_zoneidx)
1469{ 1685{
1470 const gfp_t wait = gfp_mask & __GFP_WAIT;
1471 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1472 struct zoneref *z; 1686 struct zoneref *z;
1473 struct zone *zone; 1687 struct zone *zone;
1474 struct page *page;
1475 struct reclaim_state reclaim_state;
1476 struct task_struct *p = current;
1477 int do_retry;
1478 int alloc_flags;
1479 unsigned long did_some_progress;
1480 unsigned long pages_reclaimed = 0;
1481 1688
1482 might_sleep_if(wait); 1689 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1690 wakeup_kswapd(zone, order);
1691}
1483 1692
1484 if (should_fail_alloc_page(gfp_mask, order)) 1693static inline int
1485 return NULL; 1694gfp_to_alloc_flags(gfp_t gfp_mask)
1695{
1696 struct task_struct *p = current;
1697 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1698 const gfp_t wait = gfp_mask & __GFP_WAIT;
1486 1699
1487restart: 1700 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1488 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1701 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1489 1702
1490 if (unlikely(!z->zone)) { 1703 /*
1704 * The caller may dip into page reserves a bit more if the caller
1705 * cannot run direct reclaim, or if the caller has realtime scheduling
1706 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1707 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1708 */
1709 alloc_flags |= (gfp_mask & __GFP_HIGH);
1710
1711 if (!wait) {
1712 alloc_flags |= ALLOC_HARDER;
1491 /* 1713 /*
1492 * Happens if we have an empty zonelist as a result of 1714 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1493 * GFP_THISNODE being used on a memoryless node 1715 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1494 */ 1716 */
1495 return NULL; 1717 alloc_flags &= ~ALLOC_CPUSET;
1718 } else if (unlikely(rt_task(p)))
1719 alloc_flags |= ALLOC_HARDER;
1720
1721 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1722 if (!in_interrupt() &&
1723 ((p->flags & PF_MEMALLOC) ||
1724 unlikely(test_thread_flag(TIF_MEMDIE))))
1725 alloc_flags |= ALLOC_NO_WATERMARKS;
1496 } 1726 }
1497 1727
1498 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1728 return alloc_flags;
1499 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1729}
1500 if (page) 1730
1501 goto got_pg; 1731static inline struct page *
1732__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1733 struct zonelist *zonelist, enum zone_type high_zoneidx,
1734 nodemask_t *nodemask, struct zone *preferred_zone,
1735 int migratetype)
1736{
1737 const gfp_t wait = gfp_mask & __GFP_WAIT;
1738 struct page *page = NULL;
1739 int alloc_flags;
1740 unsigned long pages_reclaimed = 0;
1741 unsigned long did_some_progress;
1742 struct task_struct *p = current;
1743
1744 /*
1745 * In the slowpath, we sanity check order to avoid ever trying to
1746 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1747 * be using allocators in order of preference for an area that is
1748 * too large.
1749 */
1750 if (order >= MAX_ORDER) {
1751 WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
1752 return NULL;
1753 }
1502 1754
1503 /* 1755 /*
1504 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1756 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1511,151 +1763,88 @@ restart:
1511 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1763 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1512 goto nopage; 1764 goto nopage;
1513 1765
1514 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1766 wake_all_kswapd(order, zonelist, high_zoneidx);
1515 wakeup_kswapd(zone, order);
1516 1767
1517 /* 1768 /*
1518 * OK, we're below the kswapd watermark and have kicked background 1769 * OK, we're below the kswapd watermark and have kicked background
1519 * reclaim. Now things get more complex, so set up alloc_flags according 1770 * reclaim. Now things get more complex, so set up alloc_flags according
1520 * to how we want to proceed. 1771 * to how we want to proceed.
1521 *
1522 * The caller may dip into page reserves a bit more if the caller
1523 * cannot run direct reclaim, or if the caller has realtime scheduling
1524 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1525 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1526 */ 1772 */
1527 alloc_flags = ALLOC_WMARK_MIN; 1773 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1528 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1529 alloc_flags |= ALLOC_HARDER;
1530 if (gfp_mask & __GFP_HIGH)
1531 alloc_flags |= ALLOC_HIGH;
1532 if (wait)
1533 alloc_flags |= ALLOC_CPUSET;
1534 1774
1535 /* 1775restart:
1536 * Go through the zonelist again. Let __GFP_HIGH and allocations 1776 /* This is the last chance, in general, before the goto nopage. */
1537 * coming from realtime tasks go deeper into reserves.
1538 *
1539 * This is the last chance, in general, before the goto nopage.
1540 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1541 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1542 */
1543 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1777 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1544 high_zoneidx, alloc_flags); 1778 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1779 preferred_zone, migratetype);
1545 if (page) 1780 if (page)
1546 goto got_pg; 1781 goto got_pg;
1547 1782
1548 /* This allocation should allow future memory freeing. */
1549
1550rebalance: 1783rebalance:
1551 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1784 /* Allocate without watermarks if the context allows */
1552 && !in_interrupt()) { 1785 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1553 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1786 page = __alloc_pages_high_priority(gfp_mask, order,
1554nofail_alloc: 1787 zonelist, high_zoneidx, nodemask,
1555 /* go through the zonelist yet again, ignoring mins */ 1788 preferred_zone, migratetype);
1556 page = get_page_from_freelist(gfp_mask, nodemask, order, 1789 if (page)
1557 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1790 goto got_pg;
1558 if (page)
1559 goto got_pg;
1560 if (gfp_mask & __GFP_NOFAIL) {
1561 congestion_wait(WRITE, HZ/50);
1562 goto nofail_alloc;
1563 }
1564 }
1565 goto nopage;
1566 } 1791 }
1567 1792
1568 /* Atomic allocations - we can't balance anything */ 1793 /* Atomic allocations - we can't balance anything */
1569 if (!wait) 1794 if (!wait)
1570 goto nopage; 1795 goto nopage;
1571 1796
1572 cond_resched(); 1797 /* Avoid recursion of direct reclaim */
1573 1798 if (p->flags & PF_MEMALLOC)
1574 /* We now go into synchronous reclaim */ 1799 goto nopage;
1575 cpuset_memory_pressure_bump();
1576 /*
1577 * The task's cpuset might have expanded its set of allowable nodes
1578 */
1579 cpuset_update_task_memory_state();
1580 p->flags |= PF_MEMALLOC;
1581 reclaim_state.reclaimed_slab = 0;
1582 p->reclaim_state = &reclaim_state;
1583 1800
1584 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); 1801 /* Avoid allocations with no watermarks from looping endlessly */
1802 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
1803 goto nopage;
1585 1804
1586 p->reclaim_state = NULL; 1805 /* Try direct reclaim and then allocating */
1587 p->flags &= ~PF_MEMALLOC; 1806 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1807 zonelist, high_zoneidx,
1808 nodemask,
1809 alloc_flags, preferred_zone,
1810 migratetype, &did_some_progress);
1811 if (page)
1812 goto got_pg;
1588 1813
1589 cond_resched(); 1814 /*
1815 * If we failed to make any progress reclaiming, then we are
1816 * running out of options and have to consider going OOM
1817 */
1818 if (!did_some_progress) {
1819 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1820 if (oom_killer_disabled)
1821 goto nopage;
1822 page = __alloc_pages_may_oom(gfp_mask, order,
1823 zonelist, high_zoneidx,
1824 nodemask, preferred_zone,
1825 migratetype);
1826 if (page)
1827 goto got_pg;
1590 1828
1591 if (order != 0) 1829 /*
1592 drain_all_pages(); 1830 * The OOM killer does not trigger for high-order
1831 * ~__GFP_NOFAIL allocations so if no progress is being
1832 * made, there are no other options and retrying is
1833 * unlikely to help.
1834 */
1835 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1836 !(gfp_mask & __GFP_NOFAIL))
1837 goto nopage;
1593 1838
1594 if (likely(did_some_progress)) {
1595 page = get_page_from_freelist(gfp_mask, nodemask, order,
1596 zonelist, high_zoneidx, alloc_flags);
1597 if (page)
1598 goto got_pg;
1599 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1600 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1601 schedule_timeout_uninterruptible(1);
1602 goto restart; 1839 goto restart;
1603 } 1840 }
1604
1605 /*
1606 * Go through the zonelist yet one more time, keep
1607 * very high watermark here, this is only to catch
1608 * a parallel oom killing, we must fail if we're still
1609 * under heavy pressure.
1610 */
1611 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1612 order, zonelist, high_zoneidx,
1613 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1614 if (page) {
1615 clear_zonelist_oom(zonelist, gfp_mask);
1616 goto got_pg;
1617 }
1618
1619 /* The OOM killer will not help higher order allocs so fail */
1620 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1621 clear_zonelist_oom(zonelist, gfp_mask);
1622 goto nopage;
1623 }
1624
1625 out_of_memory(zonelist, gfp_mask, order);
1626 clear_zonelist_oom(zonelist, gfp_mask);
1627 goto restart;
1628 } 1841 }
1629 1842
1630 /* 1843 /* Check if we should retry the allocation */
1631 * Don't let big-order allocations loop unless the caller explicitly
1632 * requests that. Wait for some write requests to complete then retry.
1633 *
1634 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1635 * means __GFP_NOFAIL, but that may not be true in other
1636 * implementations.
1637 *
1638 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1639 * specified, then we retry until we no longer reclaim any pages
1640 * (above), or we've reclaimed an order of pages at least as
1641 * large as the allocation's order. In both cases, if the
1642 * allocation still fails, we stop retrying.
1643 */
1644 pages_reclaimed += did_some_progress; 1844 pages_reclaimed += did_some_progress;
1645 do_retry = 0; 1845 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1646 if (!(gfp_mask & __GFP_NORETRY)) { 1846 /* Wait for some write requests to complete then retry */
1647 if (order <= PAGE_ALLOC_COSTLY_ORDER) { 1847 congestion_wait(BLK_RW_ASYNC, HZ/50);
1648 do_retry = 1;
1649 } else {
1650 if (gfp_mask & __GFP_REPEAT &&
1651 pages_reclaimed < (1 << order))
1652 do_retry = 1;
1653 }
1654 if (gfp_mask & __GFP_NOFAIL)
1655 do_retry = 1;
1656 }
1657 if (do_retry) {
1658 congestion_wait(WRITE, HZ/50);
1659 goto rebalance; 1848 goto rebalance;
1660 } 1849 }
1661 1850
@@ -1667,10 +1856,60 @@ nopage:
1667 dump_stack(); 1856 dump_stack();
1668 show_mem(); 1857 show_mem();
1669 } 1858 }
1859 return page;
1670got_pg: 1860got_pg:
1861 if (kmemcheck_enabled)
1862 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1671 return page; 1863 return page;
1864
1672} 1865}
1673EXPORT_SYMBOL(__alloc_pages_internal); 1866
1867/*
1868 * This is the 'heart' of the zoned buddy allocator.
1869 */
1870struct page *
1871__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1872 struct zonelist *zonelist, nodemask_t *nodemask)
1873{
1874 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1875 struct zone *preferred_zone;
1876 struct page *page;
1877 int migratetype = allocflags_to_migratetype(gfp_mask);
1878
1879 gfp_mask &= gfp_allowed_mask;
1880
1881 lockdep_trace_alloc(gfp_mask);
1882
1883 might_sleep_if(gfp_mask & __GFP_WAIT);
1884
1885 if (should_fail_alloc_page(gfp_mask, order))
1886 return NULL;
1887
1888 /*
1889 * Check the zones suitable for the gfp_mask contain at least one
1890 * valid zone. It's possible to have an empty zonelist as a result
1891 * of GFP_THISNODE and a memoryless node
1892 */
1893 if (unlikely(!zonelist->_zonerefs->zone))
1894 return NULL;
1895
1896 /* The preferred zone is used for statistics later */
1897 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1898 if (!preferred_zone)
1899 return NULL;
1900
1901 /* First allocation attempt */
1902 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1903 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1904 preferred_zone, migratetype);
1905 if (unlikely(!page))
1906 page = __alloc_pages_slowpath(gfp_mask, order,
1907 zonelist, high_zoneidx, nodemask,
1908 preferred_zone, migratetype);
1909
1910 return page;
1911}
1912EXPORT_SYMBOL(__alloc_pages_nodemask);
1674 1913
1675/* 1914/*
1676 * Common helper functions. 1915 * Common helper functions.
@@ -1757,7 +1996,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1757 unsigned long alloc_end = addr + (PAGE_SIZE << order); 1996 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1758 unsigned long used = addr + PAGE_ALIGN(size); 1997 unsigned long used = addr + PAGE_ALIGN(size);
1759 1998
1760 split_page(virt_to_page(addr), order); 1999 split_page(virt_to_page((void *)addr), order);
1761 while (used < alloc_end) { 2000 while (used < alloc_end) {
1762 free_page(used); 2001 free_page(used);
1763 used += PAGE_SIZE; 2002 used += PAGE_SIZE;
@@ -1799,7 +2038,7 @@ static unsigned int nr_free_zone_pages(int offset)
1799 2038
1800 for_each_zone_zonelist(zone, z, zonelist, offset) { 2039 for_each_zone_zonelist(zone, z, zonelist, offset) {
1801 unsigned long size = zone->present_pages; 2040 unsigned long size = zone->present_pages;
1802 unsigned long high = zone->pages_high; 2041 unsigned long high = high_wmark_pages(zone);
1803 if (size > high) 2042 if (size > high)
1804 sum += size - high; 2043 sum += size - high;
1805 } 2044 }
@@ -1874,10 +2113,7 @@ void show_free_areas(void)
1874 int cpu; 2113 int cpu;
1875 struct zone *zone; 2114 struct zone *zone;
1876 2115
1877 for_each_zone(zone) { 2116 for_each_populated_zone(zone) {
1878 if (!populated_zone(zone))
1879 continue;
1880
1881 show_node(zone); 2117 show_node(zone);
1882 printk("%s per-cpu:\n", zone->name); 2118 printk("%s per-cpu:\n", zone->name);
1883 2119
@@ -1894,19 +2130,14 @@ void show_free_areas(void)
1894 2130
1895 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2131 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1896 " inactive_file:%lu" 2132 " inactive_file:%lu"
1897//TODO: check/adjust line lengths
1898#ifdef CONFIG_UNEVICTABLE_LRU
1899 " unevictable:%lu" 2133 " unevictable:%lu"
1900#endif
1901 " dirty:%lu writeback:%lu unstable:%lu\n" 2134 " dirty:%lu writeback:%lu unstable:%lu\n"
1902 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2135 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1903 global_page_state(NR_ACTIVE_ANON), 2136 global_page_state(NR_ACTIVE_ANON),
1904 global_page_state(NR_ACTIVE_FILE), 2137 global_page_state(NR_ACTIVE_FILE),
1905 global_page_state(NR_INACTIVE_ANON), 2138 global_page_state(NR_INACTIVE_ANON),
1906 global_page_state(NR_INACTIVE_FILE), 2139 global_page_state(NR_INACTIVE_FILE),
1907#ifdef CONFIG_UNEVICTABLE_LRU
1908 global_page_state(NR_UNEVICTABLE), 2140 global_page_state(NR_UNEVICTABLE),
1909#endif
1910 global_page_state(NR_FILE_DIRTY), 2141 global_page_state(NR_FILE_DIRTY),
1911 global_page_state(NR_WRITEBACK), 2142 global_page_state(NR_WRITEBACK),
1912 global_page_state(NR_UNSTABLE_NFS), 2143 global_page_state(NR_UNSTABLE_NFS),
@@ -1917,12 +2148,9 @@ void show_free_areas(void)
1917 global_page_state(NR_PAGETABLE), 2148 global_page_state(NR_PAGETABLE),
1918 global_page_state(NR_BOUNCE)); 2149 global_page_state(NR_BOUNCE));
1919 2150
1920 for_each_zone(zone) { 2151 for_each_populated_zone(zone) {
1921 int i; 2152 int i;
1922 2153
1923 if (!populated_zone(zone))
1924 continue;
1925
1926 show_node(zone); 2154 show_node(zone);
1927 printk("%s" 2155 printk("%s"
1928 " free:%lukB" 2156 " free:%lukB"
@@ -1933,25 +2161,21 @@ void show_free_areas(void)
1933 " inactive_anon:%lukB" 2161 " inactive_anon:%lukB"
1934 " active_file:%lukB" 2162 " active_file:%lukB"
1935 " inactive_file:%lukB" 2163 " inactive_file:%lukB"
1936#ifdef CONFIG_UNEVICTABLE_LRU
1937 " unevictable:%lukB" 2164 " unevictable:%lukB"
1938#endif
1939 " present:%lukB" 2165 " present:%lukB"
1940 " pages_scanned:%lu" 2166 " pages_scanned:%lu"
1941 " all_unreclaimable? %s" 2167 " all_unreclaimable? %s"
1942 "\n", 2168 "\n",
1943 zone->name, 2169 zone->name,
1944 K(zone_page_state(zone, NR_FREE_PAGES)), 2170 K(zone_page_state(zone, NR_FREE_PAGES)),
1945 K(zone->pages_min), 2171 K(min_wmark_pages(zone)),
1946 K(zone->pages_low), 2172 K(low_wmark_pages(zone)),
1947 K(zone->pages_high), 2173 K(high_wmark_pages(zone)),
1948 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2174 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1949 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2175 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1950 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2176 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1951 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2177 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1952#ifdef CONFIG_UNEVICTABLE_LRU
1953 K(zone_page_state(zone, NR_UNEVICTABLE)), 2178 K(zone_page_state(zone, NR_UNEVICTABLE)),
1954#endif
1955 K(zone->present_pages), 2179 K(zone->present_pages),
1956 zone->pages_scanned, 2180 zone->pages_scanned,
1957 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2181 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -1962,12 +2186,9 @@ void show_free_areas(void)
1962 printk("\n"); 2186 printk("\n");
1963 } 2187 }
1964 2188
1965 for_each_zone(zone) { 2189 for_each_populated_zone(zone) {
1966 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2190 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1967 2191
1968 if (!populated_zone(zone))
1969 continue;
1970
1971 show_node(zone); 2192 show_node(zone);
1972 printk("%s: ", zone->name); 2193 printk("%s: ", zone->name);
1973 2194
@@ -2112,7 +2333,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2112} 2333}
2113 2334
2114 2335
2115#define MAX_NODE_LOAD (num_online_nodes()) 2336#define MAX_NODE_LOAD (nr_online_nodes)
2116static int node_load[MAX_NUMNODES]; 2337static int node_load[MAX_NUMNODES];
2117 2338
2118/** 2339/**
@@ -2134,7 +2355,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2134 int n, val; 2355 int n, val;
2135 int min_val = INT_MAX; 2356 int min_val = INT_MAX;
2136 int best_node = -1; 2357 int best_node = -1;
2137 node_to_cpumask_ptr(tmp, 0); 2358 const struct cpumask *tmp = cpumask_of_node(0);
2138 2359
2139 /* Use the local node if we haven't already */ 2360 /* Use the local node if we haven't already */
2140 if (!node_isset(node, *used_node_mask)) { 2361 if (!node_isset(node, *used_node_mask)) {
@@ -2155,8 +2376,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2155 val += (n < node); 2376 val += (n < node);
2156 2377
2157 /* Give preference to headless and unused nodes */ 2378 /* Give preference to headless and unused nodes */
2158 node_to_cpumask_ptr_next(tmp, n); 2379 tmp = cpumask_of_node(n);
2159 if (!cpus_empty(*tmp)) 2380 if (!cpumask_empty(tmp))
2160 val += PENALTY_FOR_NODE_WITH_CPUS; 2381 val += PENALTY_FOR_NODE_WITH_CPUS;
2161 2382
2162 /* Slight preference for less loaded node */ 2383 /* Slight preference for less loaded node */
@@ -2321,11 +2542,10 @@ static void build_zonelists(pg_data_t *pgdat)
2321 2542
2322 /* NUMA-aware ordering of nodes */ 2543 /* NUMA-aware ordering of nodes */
2323 local_node = pgdat->node_id; 2544 local_node = pgdat->node_id;
2324 load = num_online_nodes(); 2545 load = nr_online_nodes;
2325 prev_node = local_node; 2546 prev_node = local_node;
2326 nodes_clear(used_mask); 2547 nodes_clear(used_mask);
2327 2548
2328 memset(node_load, 0, sizeof(node_load));
2329 memset(node_order, 0, sizeof(node_order)); 2549 memset(node_order, 0, sizeof(node_order));
2330 j = 0; 2550 j = 0;
2331 2551
@@ -2434,6 +2654,9 @@ static int __build_all_zonelists(void *dummy)
2434{ 2654{
2435 int nid; 2655 int nid;
2436 2656
2657#ifdef CONFIG_NUMA
2658 memset(node_load, 0, sizeof(node_load));
2659#endif
2437 for_each_online_node(nid) { 2660 for_each_online_node(nid) {
2438 pg_data_t *pgdat = NODE_DATA(nid); 2661 pg_data_t *pgdat = NODE_DATA(nid);
2439 2662
@@ -2472,7 +2695,7 @@ void build_all_zonelists(void)
2472 2695
2473 printk("Built %i zonelists in %s order, mobility grouping %s. " 2696 printk("Built %i zonelists in %s order, mobility grouping %s. "
2474 "Total pages: %ld\n", 2697 "Total pages: %ld\n",
2475 num_online_nodes(), 2698 nr_online_nodes,
2476 zonelist_order_name[current_zonelist_order], 2699 zonelist_order_name[current_zonelist_order],
2477 page_group_by_mobility_disabled ? "off" : "on", 2700 page_group_by_mobility_disabled ? "off" : "on",
2478 vm_total_pages); 2701 vm_total_pages);
@@ -2551,8 +2774,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2551 2774
2552/* 2775/*
2553 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2776 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2554 * of blocks reserved is based on zone->pages_min. The memory within the 2777 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2555 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2778 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2556 * higher will lead to a bigger reserve which will get freed as contiguous 2779 * higher will lead to a bigger reserve which will get freed as contiguous
2557 * blocks as reclaim kicks in 2780 * blocks as reclaim kicks in
2558 */ 2781 */
@@ -2565,7 +2788,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2565 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2788 /* Get the start pfn, end pfn and the number of blocks to reserve */
2566 start_pfn = zone->zone_start_pfn; 2789 start_pfn = zone->zone_start_pfn;
2567 end_pfn = start_pfn + zone->spanned_pages; 2790 end_pfn = start_pfn + zone->spanned_pages;
2568 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2791 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2569 pageblock_order; 2792 pageblock_order;
2570 2793
2571 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2794 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -2687,6 +2910,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
2687 2910
2688static int zone_batchsize(struct zone *zone) 2911static int zone_batchsize(struct zone *zone)
2689{ 2912{
2913#ifdef CONFIG_MMU
2690 int batch; 2914 int batch;
2691 2915
2692 /* 2916 /*
@@ -2712,9 +2936,26 @@ static int zone_batchsize(struct zone *zone)
2712 * of pages of one half of the possible page colors 2936 * of pages of one half of the possible page colors
2713 * and the other with pages of the other colors. 2937 * and the other with pages of the other colors.
2714 */ 2938 */
2715 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2939 batch = rounddown_pow_of_two(batch + batch/2) - 1;
2716 2940
2717 return batch; 2941 return batch;
2942
2943#else
2944 /* The deferral and batching of frees should be suppressed under NOMMU
2945 * conditions.
2946 *
2947 * The problem is that NOMMU needs to be able to allocate large chunks
2948 * of contiguous memory as there's no hardware page translation to
2949 * assemble apparent contiguous memory from discontiguous pages.
2950 *
2951 * Queueing large contiguous runs of pages for batching, however,
2952 * causes the pages to actually be freed in smaller chunks. As there
2953 * can be a significant delay between the individual batches being
2954 * recycled, this leads to the once large chunks of space being
2955 * fragmented and becoming unavailable for high-order allocations.
2956 */
2957 return 0;
2958#endif
2718} 2959}
2719 2960
2720static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2961static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
@@ -2779,11 +3020,7 @@ static int __cpuinit process_zones(int cpu)
2779 3020
2780 node_set_state(node, N_CPU); /* this node has a cpu */ 3021 node_set_state(node, N_CPU); /* this node has a cpu */
2781 3022
2782 for_each_zone(zone) { 3023 for_each_populated_zone(zone) {
2783
2784 if (!populated_zone(zone))
2785 continue;
2786
2787 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 3024 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2788 GFP_KERNEL, node); 3025 GFP_KERNEL, node);
2789 if (!zone_pcp(zone, cpu)) 3026 if (!zone_pcp(zone, cpu))
@@ -2804,7 +3041,7 @@ bad:
2804 if (dzone == zone) 3041 if (dzone == zone)
2805 break; 3042 break;
2806 kfree(zone_pcp(dzone, cpu)); 3043 kfree(zone_pcp(dzone, cpu));
2807 zone_pcp(dzone, cpu) = NULL; 3044 zone_pcp(dzone, cpu) = &boot_pageset[cpu];
2808 } 3045 }
2809 return -ENOMEM; 3046 return -ENOMEM;
2810} 3047}
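
The hunk above folds the open-coded populated_zone() test into the iterator itself. A hedged stand-alone sketch of that idiom follows; the stub macro and structures are illustrative and are not the kernel's for_each_populated_zone() definition.

    /*
     * Illustrative only: the same "skip empty zones inside the iterator"
     * idea, expressed with stub types.
     */
    #include <stdio.h>

    struct zone_stub {
        const char *name;
        unsigned long present_pages;
    };

    #define for_each_populated_zone_stub(z, zones, n)           \
        for ((z) = (zones); (z) < (zones) + (n); (z)++)         \
            if ((z)->present_pages == 0)                        \
                ;   /* unpopulated: silently skipped */         \
            else

    int main(void)
    {
        struct zone_stub zones[] = {
            { "DMA",    4096 },
            { "DMA32",  0 },        /* unpopulated */
            { "Normal", 262144 },
        };
        struct zone_stub *z;

        for_each_populated_zone_stub(z, zones, 3)
            printf("allocate a per-cpu pageset for %s\n", z->name);
        return 0;
    }
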
@@ -2819,7 +3056,7 @@ static inline void free_zone_pagesets(int cpu)
2819 /* Free per_cpu_pageset if it is slab allocated */ 3056 /* Free per_cpu_pageset if it is slab allocated */
2820 if (pset != &boot_pageset[cpu]) 3057 if (pset != &boot_pageset[cpu])
2821 kfree(pset); 3058 kfree(pset);
2822 zone_pcp(zone, cpu) = NULL; 3059 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2823 } 3060 }
2824} 3061}
2825 3062
@@ -3095,64 +3332,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
3095} 3332}
3096 3333
3097/** 3334/**
3098 * push_node_boundaries - Push node boundaries to at least the requested boundary
3099 * @nid: The nid of the node to push the boundary for
3100 * @start_pfn: The start pfn of the node
3101 * @end_pfn: The end pfn of the node
3102 *
3103 * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
3104 * time. Specifically, on x86_64, SRAT will report ranges that can potentially
3105 * be hotplugged even though no physical memory exists. This function allows
3106 * an arch to push out the node boundaries so mem_map is allocated that can
3107 * be used later.
3108 */
3109#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3110void __init push_node_boundaries(unsigned int nid,
3111 unsigned long start_pfn, unsigned long end_pfn)
3112{
3113 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3114 "Entering push_node_boundaries(%u, %lu, %lu)\n",
3115 nid, start_pfn, end_pfn);
3116
3117 /* Initialise the boundary for this node if necessary */
3118 if (node_boundary_end_pfn[nid] == 0)
3119 node_boundary_start_pfn[nid] = -1UL;
3120
3121 /* Update the boundaries */
3122 if (node_boundary_start_pfn[nid] > start_pfn)
3123 node_boundary_start_pfn[nid] = start_pfn;
3124 if (node_boundary_end_pfn[nid] < end_pfn)
3125 node_boundary_end_pfn[nid] = end_pfn;
3126}
3127
3128/* If necessary, push the node boundary out for reserve hotadd */
3129static void __meminit account_node_boundary(unsigned int nid,
3130 unsigned long *start_pfn, unsigned long *end_pfn)
3131{
3132 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3133 "Entering account_node_boundary(%u, %lu, %lu)\n",
3134 nid, *start_pfn, *end_pfn);
3135
3136 /* Return if boundary information has not been provided */
3137 if (node_boundary_end_pfn[nid] == 0)
3138 return;
3139
3140 /* Check the boundaries and update if necessary */
3141 if (node_boundary_start_pfn[nid] < *start_pfn)
3142 *start_pfn = node_boundary_start_pfn[nid];
3143 if (node_boundary_end_pfn[nid] > *end_pfn)
3144 *end_pfn = node_boundary_end_pfn[nid];
3145}
3146#else
3147void __init push_node_boundaries(unsigned int nid,
3148 unsigned long start_pfn, unsigned long end_pfn) {}
3149
3150static void __meminit account_node_boundary(unsigned int nid,
3151 unsigned long *start_pfn, unsigned long *end_pfn) {}
3152#endif
3153
3154
3155/**
3156 * get_pfn_range_for_nid - Return the start and end page frames for a node 3335 * get_pfn_range_for_nid - Return the start and end page frames for a node
3157 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 3336 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
3158 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 3337 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
@@ -3177,9 +3356,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3177 3356
3178 if (*start_pfn == -1UL) 3357 if (*start_pfn == -1UL)
3179 *start_pfn = 0; 3358 *start_pfn = 0;
3180
3181 /* Push the node boundaries out if requested */
3182 account_node_boundary(nid, start_pfn, end_pfn);
3183} 3359}
3184 3360
3185/* 3361/*
@@ -3544,7 +3720,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3544 zone_pcp_init(zone); 3720 zone_pcp_init(zone);
3545 for_each_lru(l) { 3721 for_each_lru(l) {
3546 INIT_LIST_HEAD(&zone->lru[l].list); 3722 INIT_LIST_HEAD(&zone->lru[l].list);
3547 zone->lru[l].nr_scan = 0; 3723 zone->lru[l].nr_saved_scan = 0;
3548 } 3724 }
3549 zone->reclaim_stat.recent_rotated[0] = 0; 3725 zone->reclaim_stat.recent_rotated[0] = 0;
3550 zone->reclaim_stat.recent_rotated[1] = 0; 3726 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -3785,10 +3961,6 @@ void __init remove_all_active_ranges(void)
3785{ 3961{
3786 memset(early_node_map, 0, sizeof(early_node_map)); 3962 memset(early_node_map, 0, sizeof(early_node_map));
3787 nr_nodemap_entries = 0; 3963 nr_nodemap_entries = 0;
3788#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
3789 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
3790 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
3791#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
3792} 3964}
3793 3965
3794/* Compare two active node_active_regions */ 3966/* Compare two active node_active_regions */
@@ -3875,6 +4047,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3875 int i, nid; 4047 int i, nid;
3876 unsigned long usable_startpfn; 4048 unsigned long usable_startpfn;
3877 unsigned long kernelcore_node, kernelcore_remaining; 4049 unsigned long kernelcore_node, kernelcore_remaining;
4050 /* save the state before borrow the nodemask */
4051 nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
3878 unsigned long totalpages = early_calculate_totalpages(); 4052 unsigned long totalpages = early_calculate_totalpages();
3879 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); 4053 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3880 4054
@@ -3902,7 +4076,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3902 4076
3903 /* If kernelcore was not specified, there is no ZONE_MOVABLE */ 4077 /* If kernelcore was not specified, there is no ZONE_MOVABLE */
3904 if (!required_kernelcore) 4078 if (!required_kernelcore)
3905 return; 4079 goto out;
3906 4080
3907 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ 4081 /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
3908 find_usable_zone_for_movable(); 4082 find_usable_zone_for_movable();
@@ -4001,6 +4175,10 @@ restart:
4001 for (nid = 0; nid < MAX_NUMNODES; nid++) 4175 for (nid = 0; nid < MAX_NUMNODES; nid++)
4002 zone_movable_pfn[nid] = 4176 zone_movable_pfn[nid] =
4003 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 4177 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
4178
4179out:
4180 /* restore the node_state */
4181 node_states[N_HIGH_MEMORY] = saved_node_state;
4004} 4182}
4005 4183
4006/* Any regular memory on that node ? */ 4184/* Any regular memory on that node ? */
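
The change above copies the N_HIGH_MEMORY nodemask on entry and restores it at a single out: label, so the function can temporarily clear bits while distributing kernelcore without leaking that state on an early exit. A minimal sketch of the pattern, with illustrative names and mask layout:

    /*
     * Shape of the fix only: copy the shared state on entry, turn the
     * early return into "goto out", restore at the single exit point.
     */
    #include <stdio.h>

    static unsigned long node_mask = 0x7;   /* pretend three nodes online */

    static void distribute_kernelcore(unsigned long required_kernelcore)
    {
        unsigned long saved_mask = node_mask;   /* save before borrowing */

        if (!required_kernelcore)
            goto out;           /* was a bare "return" before the patch */

        node_mask &= ~0x2UL;    /* temporarily drop a node while computing */
        printf("working with borrowed mask 0x%lx\n", node_mask);

    out:
        node_mask = saved_mask; /* restored on every exit path */
    }

    int main(void)
    {
        distribute_kernelcore(0);
        distribute_kernelcore(1024);
        printf("final mask 0x%lx (unchanged)\n", node_mask);
        return 0;
    }
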
@@ -4219,8 +4397,8 @@ static void calculate_totalreserve_pages(void)
4219 max = zone->lowmem_reserve[j]; 4397 max = zone->lowmem_reserve[j];
4220 } 4398 }
4221 4399
4222 /* we treat pages_high as reserved pages. */ 4400 /* we treat the high watermark as reserved pages. */
4223 max += zone->pages_high; 4401 max += high_wmark_pages(zone);
4224 4402
4225 if (max > zone->present_pages) 4403 if (max > zone->present_pages)
4226 max = zone->present_pages; 4404 max = zone->present_pages;
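
high_wmark_pages() and friends replace direct pages_{min,low,high} field accesses throughout this patch. A hedged sketch of how such an accessor can wrap the new watermark[] array; the kernel's real definitions live in a header outside this diff.

    /*
     * Sketch only: stub structure plus an accessor macro, then the same
     * reserve accounting as the hunk above.
     */
    #include <stdio.h>

    enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

    struct zone_stub {
        unsigned long watermark[NR_WMARK];
        unsigned long present_pages;
    };

    #define high_wmark_pages(z) ((z)->watermark[WMARK_HIGH])

    int main(void)
    {
        struct zone_stub zone = {
            .watermark = { 1024, 1280, 1536 },  /* example min/low/high */
            .present_pages = 262144,
        };
        unsigned long max = 0;

        /* mirrors the reserve accounting in the hunk above */
        max += high_wmark_pages(&zone);
        if (max > zone.present_pages)
            max = zone.present_pages;
        printf("pages treated as reserved for this zone: %lu\n", max);
        return 0;
    }
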
@@ -4270,12 +4448,13 @@ static void setup_per_zone_lowmem_reserve(void)
4270} 4448}
4271 4449
4272/** 4450/**
4273 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4451 * setup_per_zone_wmarks - called when min_free_kbytes changes
4452 * or when memory is hot-{added|removed}
4274 * 4453 *
4275 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4454 * Ensures that the watermark[min,low,high] values for each zone are set
4276 * with respect to min_free_kbytes. 4455 * correctly with respect to min_free_kbytes.
4277 */ 4456 */
4278void setup_per_zone_pages_min(void) 4457void setup_per_zone_wmarks(void)
4279{ 4458{
4280 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4459 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4281 unsigned long lowmem_pages = 0; 4460 unsigned long lowmem_pages = 0;
@@ -4300,7 +4479,7 @@ void setup_per_zone_pages_min(void)
4300 * need highmem pages, so cap pages_min to a small 4479 * need highmem pages, so cap pages_min to a small
4301 * value here. 4480 * value here.
4302 * 4481 *
4303 * The (pages_high-pages_low) and (pages_low-pages_min) 4482 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
4304 * deltas controls asynch page reclaim, and so should 4483 * deltas controls asynch page reclaim, and so should
4305 * not be capped for highmem. 4484 * not be capped for highmem.
4306 */ 4485 */
@@ -4311,17 +4490,17 @@ void setup_per_zone_pages_min(void)
4311 min_pages = SWAP_CLUSTER_MAX; 4490 min_pages = SWAP_CLUSTER_MAX;
4312 if (min_pages > 128) 4491 if (min_pages > 128)
4313 min_pages = 128; 4492 min_pages = 128;
4314 zone->pages_min = min_pages; 4493 zone->watermark[WMARK_MIN] = min_pages;
4315 } else { 4494 } else {
4316 /* 4495 /*
4317 * If it's a lowmem zone, reserve a number of pages 4496 * If it's a lowmem zone, reserve a number of pages
4318 * proportionate to the zone's size. 4497 * proportionate to the zone's size.
4319 */ 4498 */
4320 zone->pages_min = tmp; 4499 zone->watermark[WMARK_MIN] = tmp;
4321 } 4500 }
4322 4501
4323 zone->pages_low = zone->pages_min + (tmp >> 2); 4502 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4324 zone->pages_high = zone->pages_min + (tmp >> 1); 4503 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4325 setup_zone_migrate_reserve(zone); 4504 setup_zone_migrate_reserve(zone);
4326 spin_unlock_irqrestore(&zone->lock, flags); 4505 spin_unlock_irqrestore(&zone->lock, flags);
4327 } 4506 }
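
Putting the renamed helper's arithmetic in one place: for a lowmem zone, the min mark is the zone's proportional share of min_free_kbytes, and the low and high marks sit 25% and 50% above it. A userspace sketch with a single 1GB lowmem zone as an assumed example:

    /*
     * Sketch of the lowmem branch of setup_per_zone_wmarks(): tmp is the
     * zone's share of pages_min, low/high are +tmp/4 and +tmp/2.
     */
    #include <stdio.h>

    #define PAGE_SHIFT 12

    enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

    int main(void)
    {
        unsigned long min_free_kbytes = 4096;   /* example sysctl value */
        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
        unsigned long lowmem_pages = 262144;    /* 1GB of lowmem in total */
        unsigned long zone_pages = 262144;      /* all of it in this zone */
        unsigned long watermark[NR_WMARK];
        unsigned long tmp;

        tmp = (unsigned long)((unsigned long long)pages_min * zone_pages /
                              lowmem_pages);

        watermark[WMARK_MIN]  = tmp;
        watermark[WMARK_LOW]  = watermark[WMARK_MIN] + (tmp >> 2);
        watermark[WMARK_HIGH] = watermark[WMARK_MIN] + (tmp >> 1);

        printf("min=%lu low=%lu high=%lu pages\n",
               watermark[WMARK_MIN], watermark[WMARK_LOW],
               watermark[WMARK_HIGH]);          /* 1024, 1280, 1536 */
        return 0;
    }
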
@@ -4331,8 +4510,6 @@ void setup_per_zone_pages_min(void)
4331} 4510}
4332 4511
4333/** 4512/**
4334 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4335 *
4336 * The inactive anon list should be small enough that the VM never has to 4513 * The inactive anon list should be small enough that the VM never has to
4337 * do too much work, but large enough that each inactive page has a chance 4514 * do too much work, but large enough that each inactive page has a chance
4338 * to be referenced again before it is swapped out. 4515 * to be referenced again before it is swapped out.
@@ -4353,21 +4530,26 @@ void setup_per_zone_pages_min(void)
4353 * 1TB 101 10GB 4530 * 1TB 101 10GB
4354 * 10TB 320 32GB 4531 * 10TB 320 32GB
4355 */ 4532 */
4356static void setup_per_zone_inactive_ratio(void) 4533void calculate_zone_inactive_ratio(struct zone *zone)
4357{ 4534{
4358 struct zone *zone; 4535 unsigned int gb, ratio;
4359 4536
4360 for_each_zone(zone) { 4537 /* Zone size in gigabytes */
4361 unsigned int gb, ratio; 4538 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4362 4539 if (gb)
4363 /* Zone size in gigabytes */
4364 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4365 ratio = int_sqrt(10 * gb); 4540 ratio = int_sqrt(10 * gb);
4366 if (!ratio) 4541 else
4367 ratio = 1; 4542 ratio = 1;
4368 4543
4369 zone->inactive_ratio = ratio; 4544 zone->inactive_ratio = ratio;
4370 } 4545}
4546
4547static void __init setup_per_zone_inactive_ratio(void)
4548{
4549 struct zone *zone;
4550
4551 for_each_zone(zone)
4552 calculate_zone_inactive_ratio(zone);
4371} 4553}
4372 4554
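
The ratio table quoted in the comment above follows directly from int_sqrt(10 * size_in_GB) with a floor of 1 for sub-gigabyte zones. A small sketch that reproduces those numbers:

    /*
     * Reproduces the table from the comment: 1GB -> 3, 10GB -> 10,
     * 100GB -> 31, 1TB -> 101, 10TB -> 320. int_sqrt_like() is a plain
     * integer square root standing in for the kernel's int_sqrt().
     */
    #include <stdio.h>

    static unsigned long int_sqrt_like(unsigned long x)
    {
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
            r++;
        return r;
    }

    int main(void)
    {
        unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };

        for (int i = 0; i < 5; i++) {
            unsigned long gb = sizes_gb[i];
            unsigned long ratio = gb ? int_sqrt_like(10 * gb) : 1;

            printf("%6luGB -> inactive_ratio %lu\n", gb, ratio);
        }
        return 0;
    }
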
4373/* 4555/*
@@ -4394,7 +4576,7 @@ static void setup_per_zone_inactive_ratio(void)
4394 * 8192MB: 11584k 4576 * 8192MB: 11584k
4395 * 16384MB: 16384k 4577 * 16384MB: 16384k
4396 */ 4578 */
4397static int __init init_per_zone_pages_min(void) 4579static int __init init_per_zone_wmark_min(void)
4398{ 4580{
4399 unsigned long lowmem_kbytes; 4581 unsigned long lowmem_kbytes;
4400 4582
@@ -4405,12 +4587,12 @@ static int __init init_per_zone_pages_min(void)
4405 min_free_kbytes = 128; 4587 min_free_kbytes = 128;
4406 if (min_free_kbytes > 65536) 4588 if (min_free_kbytes > 65536)
4407 min_free_kbytes = 65536; 4589 min_free_kbytes = 65536;
4408 setup_per_zone_pages_min(); 4590 setup_per_zone_wmarks();
4409 setup_per_zone_lowmem_reserve(); 4591 setup_per_zone_lowmem_reserve();
4410 setup_per_zone_inactive_ratio(); 4592 setup_per_zone_inactive_ratio();
4411 return 0; 4593 return 0;
4412} 4594}
4413module_init(init_per_zone_pages_min) 4595module_init(init_per_zone_wmark_min)
4414 4596
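
The initialiser's sizing rule is min_free_kbytes = int_sqrt(lowmem_kbytes * 16), clamped to [128, 65536]. The sketch below reproduces the comment's table for a few sizes; the real function derives lowmem_kbytes from the free lowmem present at boot, so live values differ slightly from these idealized inputs.

    /*
     * sqrt() stands in for the kernel's int_sqrt(); sizes are idealized
     * examples chosen so the floor of the square root is exact.
     */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned long lowmem_mb[] = { 16, 256, 1024, 16384 };

        for (int i = 0; i < 4; i++) {
            unsigned long lowmem_kbytes = lowmem_mb[i] * 1024;
            unsigned long min_free_kbytes =
                (unsigned long)sqrt((double)lowmem_kbytes * 16);

            if (min_free_kbytes < 128)
                min_free_kbytes = 128;
            if (min_free_kbytes > 65536)
                min_free_kbytes = 65536;
            printf("%6luMB lowmem -> min_free_kbytes %lu\n",
                   lowmem_mb[i], min_free_kbytes);
            /* 16MB -> 512, 256MB -> 2048, 1024MB -> 4096, 16384MB -> 16384 */
        }
        return 0;
    }
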
4415/* 4597/*
4416 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4598 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4422,7 +4604,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4422{ 4604{
4423 proc_dointvec(table, write, file, buffer, length, ppos); 4605 proc_dointvec(table, write, file, buffer, length, ppos);
4424 if (write) 4606 if (write)
4425 setup_per_zone_pages_min(); 4607 setup_per_zone_wmarks();
4426 return 0; 4608 return 0;
4427} 4609}
4428 4610
@@ -4466,7 +4648,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4466 * whenever sysctl_lowmem_reserve_ratio changes. 4648 * whenever sysctl_lowmem_reserve_ratio changes.
4467 * 4649 *
4468 * The reserve ratio obviously has absolutely no relation with the 4650 * The reserve ratio obviously has absolutely no relation with the
4469 * pages_min watermarks. The lowmem reserve ratio can only make sense 4651 * minimum watermarks. The lowmem reserve ratio can only make sense
4470 * if in function of the boot time zone sizes. 4652 * if in function of the boot time zone sizes.
4471 */ 4653 */
4472int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4654int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
@@ -4493,7 +4675,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
4493 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 4675 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
4494 if (!write || (ret == -EINVAL)) 4676 if (!write || (ret == -EINVAL))
4495 return ret; 4677 return ret;
4496 for_each_zone(zone) { 4678 for_each_populated_zone(zone) {
4497 for_each_online_cpu(cpu) { 4679 for_each_online_cpu(cpu) {
4498 unsigned long high; 4680 unsigned long high;
4499 high = zone->present_pages / percpu_pagelist_fraction; 4681 high = zone->present_pages / percpu_pagelist_fraction;
@@ -4573,22 +4755,14 @@ void *__init alloc_large_system_hash(const char *tablename,
4573 else if (hashdist) 4755 else if (hashdist)
4574 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4756 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4575 else { 4757 else {
4576 unsigned long order = get_order(size);
4577 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4578 /* 4758 /*
4579 * If bucketsize is not a power-of-two, we may free 4759 * If bucketsize is not a power-of-two, we may free
4580 * some pages at the end of hash table. 4760 * some pages at the end of hash table which
4761 * alloc_pages_exact() automatically does
4581 */ 4762 */
4582 if (table) { 4763 if (get_order(size) < MAX_ORDER) {
4583 unsigned long alloc_end = (unsigned long)table + 4764 table = alloc_pages_exact(size, GFP_ATOMIC);
4584 (PAGE_SIZE << order); 4765 kmemleak_alloc(table, size, 1, GFP_ATOMIC);
4585 unsigned long used = (unsigned long)table +
4586 PAGE_ALIGN(size);
4587 split_page(virt_to_page(table), order);
4588 while (used < alloc_end) {
4589 free_page(used);
4590 used += PAGE_SIZE;
4591 }
4592 } 4766 }
4593 } 4767 }
4594 } while (!table && size > PAGE_SIZE && --log2qty); 4768 } while (!table && size > PAGE_SIZE && --log2qty);
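
The old open-coded path allocated a power-of-two buddy block, split it, and freed the pages past PAGE_ALIGN(size); alloc_pages_exact() now performs that trimming internally (guarded by the get_order(size) < MAX_ORDER check), and kmemleak_alloc() simply registers the memory with the leak detector. A userspace sketch of the trimming arithmetic, assuming 4KB pages:

    /*
     * Arithmetic only: an order-N buddy block is a power-of-two number
     * of pages, and everything past PAGE_ALIGN(size) can be handed back
     * to the allocator.
     */
    #include <stdio.h>

    #define PAGE_SHIFT    12
    #define PAGE_SIZE     (1UL << PAGE_SHIFT)
    #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

    static unsigned int get_order_like(unsigned long size)
    {
        unsigned int order = 0;

        while ((PAGE_SIZE << order) < size)
            order++;
        return order;
    }

    int main(void)
    {
        unsigned long size = 3 * 512 * 1024;    /* a 1.5MB hash table, say */
        unsigned int order = get_order_like(size);
        unsigned long allocated = PAGE_SIZE << order;
        unsigned long used = PAGE_ALIGN(size);

        printf("request %lu bytes: order-%u block is %lu bytes, "
               "%lu tail pages freed again\n",
               size, order, allocated, (allocated - used) >> PAGE_SHIFT);
        return 0;       /* 1.5MB -> order-9 (2MB) block, 128 tail pages */
    }
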